From 2f5f6ad9390c1ebbf738d130dbfe80b60eaa167e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Aug 2011 16:57:47 -0400 Subject: ftrace: Pass ftrace_ops as third parameter to function trace callback Currently the function trace callback receives only the ip and parent_ip of the function that it traced. It would be more powerful to also return the ops that registered the function as well. This allows the same function to act differently depending on what ftrace_ops registered it. Link: http://lkml.kernel.org/r/20120612225424.267254552@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 101 ++++++++++++++++++++++++++------------ kernel/trace/trace_event_perf.c | 3 +- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_functions.c | 9 ++-- kernel/trace/trace_irqsoff.c | 3 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 15 ++++-- kernel/trace/trace_stack.c | 2 +- 8 files changed, 94 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4f20fba09fc..4f2ab9352a68 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -64,12 +64,19 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +static struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, +}; + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* Quick disabling of function tracer. */ -int function_trace_stop; +int function_trace_stop __read_mostly; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -86,10 +93,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, -}; - static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; @@ -100,8 +103,14 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif /* * Traverse the ftrace_global_list, invoking all entries. The reason that we @@ -112,29 +121,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -static void ftrace_global_list_func(unsigned long ip, - unsigned long parent_ip) +static void +ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) return; trace_recursion_set(TRACE_GLOBAL_BIT); op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { - op->func(ip, parent_ip); + op->func(ip, parent_ip, op); op = rcu_dereference_raw(op->next); /*see above*/ }; trace_recursion_clear(TRACE_GLOBAL_BIT); } -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip); + ftrace_pid_function(ip, parent_ip, op); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -163,12 +172,13 @@ void clear_ftrace_function(void) * For those archs that do not test ftrace_trace_stop in their * mcount call site, we need to do it from C. */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { if (function_trace_stop) return; - __ftrace_trace_function(ip, parent_ip); + __ftrace_trace_function(ip, parent_ip, op); } #endif @@ -230,15 +240,24 @@ static void update_ftrace_function(void) /* * If we are at the end of the list and this ops is - * not dynamic, then have the mcount trampoline call - * the function directly + * not dynamic and the arch supports passing ops, then have the + * mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + ARCH_SUPPORTS_FTRACE_OPS)) { + /* Set the ftrace_ops that the arch callback uses */ + if (ftrace_ops_list == &global_ops) + function_trace_op = ftrace_global_list; + else + function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; - else + } else { + /* Just use the default ftrace_ops */ + function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; + } #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; @@ -773,7 +792,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) } static void -function_profile_call(unsigned long ip, unsigned long parent_ip) +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -803,7 +823,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0); + function_profile_call(trace->func, 0, NULL); return 1; } @@ -2790,8 +2810,8 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct ftrace_func_probe *entry; struct hlist_head *hhd; @@ -3942,10 +3962,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) return; @@ -3959,7 +3978,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) while (op != &ftrace_list_end) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op); op = rcu_dereference_raw(op->next); }; @@ -3971,8 +3990,9 @@ static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, }; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored) { struct ftrace_ops *op; @@ -3988,13 +4008,32 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op); op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL); +} +#endif + static void clear_ftrace_swapper(void) { struct task_struct *p; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f6..a872a9a298a0 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void -perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops) { struct ftrace_entry *entry; struct hlist_head *head; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29111da1d100..88daa5177bf4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1681,7 +1681,8 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct ring_buffer_event *event; struct ring_buffer *buffer; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db09..fceb7a9aa06d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -48,7 +48,8 @@ static void function_trace_start(struct trace_array *tr) } static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -75,7 +76,8 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) } static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -106,7 +108,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e920368..2862c77f95d9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr, * irqsoff uses its own tracer function to keep the overhead down: */ static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..0caf4f5da569 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -108,7 +108,7 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977fb..9ae40c823af8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -103,35 +103,40 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_probe1_cnt++; } static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_probe2_cnt++; } static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_probe3_cnt++; } static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_global_cnt++; } static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_dyn_cnt++; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242e..e20006d5fb6a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,7 @@ static inline void check_stack(void) } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) +stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) { int cpu; -- cgit v1.2.2 From ccf3672d530170c98c734dfc5db07d64bcbad2ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jun 2012 09:44:25 -0400 Subject: ftrace: Consolidate arch dependent functions with 'list' function As the function tracer starts to get more features, the support for theses features will spread out throughout the different architectures over time. These features boil down to what each arch does in the mcount trampoline (the ftrace_caller). Currently there's two features that are not the same throughout the archs. 1) Support to stop function tracing before the callback 2) passing of the ftrace ops Both of these require placing an indirect function to support the features if the mcount trampoline does not. On a side note, for all architectures, when more than one callback is registered to the function tracer, an intermediate 'list' function is called by the mcount trampoline to iterate through the callbacks that are registered. Instead of making a separate function for each of these features, and requiring several indirect calls, just use the single 'list' function as the intermediate, to handle all cases. If an arch does not support the 'stop function tracing' or the passing of ftrace ops, just force it to use the list function that will handle the features required. This makes the code cleaner and simpler and removes a lot of #ifdefs in the code. Link: http://lkml.kernel.org/r/20120612225424.495625483@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 45 ++++----------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f2ab9352a68..4cbca2e6eb70 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -97,8 +97,6 @@ static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; @@ -162,26 +160,9 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - __ftrace_trace_function = ftrace_stub; - __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. - */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) -{ - if (function_trace_stop) - return; - - __ftrace_trace_function(ip, parent_ip, op); -} -#endif - static void control_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -246,7 +227,7 @@ static void update_ftrace_function(void) if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - ARCH_SUPPORTS_FTRACE_OPS)) { + !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) function_trace_op = ftrace_global_list; @@ -259,18 +240,7 @@ static void update_ftrace_function(void) func = ftrace_ops_list_func; } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; -#else -#ifdef CONFIG_DYNAMIC_FTRACE - /* do not update till all functions have been modified */ - __ftrace_trace_function_delay = func; -#else - __ftrace_trace_function = func; -#endif - ftrace_trace_function = - (func == ftrace_stub) ? func : ftrace_test_stop_func; -#endif } static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) @@ -1902,16 +1872,6 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); @@ -3996,6 +3956,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { struct ftrace_ops *op; + if (function_trace_stop) + return; + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) return; -- cgit v1.2.2 From a1e2e31d175a1349274eba3465d17616c6725f8c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Aug 2011 12:50:46 -0400 Subject: ftrace: Return pt_regs to function trace callback Return as the 4th paramater to the function tracer callback the pt_regs. Later patches that implement regs passing for the architectures will require having the ftrace_ops set the SAVE_REGS flag, which will tell the arch to take the time to pass a full set of pt_regs to the ftrace_ops callback function. If the arch does not support it then it should pass NULL. If an arch can pass full regs, then it should define: ARCH_SUPPORTS_FTRACE_SAVE_REGS to 1 Link: http://lkml.kernel.org/r/20120702201821.019966811@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 37 ++++++++++++++++++++++--------------- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 7 ++++--- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 3 ++- kernel/trace/trace_selftest.c | 15 ++++++++++----- kernel/trace/trace_stack.c | 3 ++- 8 files changed, 43 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4cbca2e6eb70..6ff07ad0ede3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -103,7 +103,7 @@ static struct ftrace_ops control_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op); + struct ftrace_ops *op, struct pt_regs *regs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -121,7 +121,7 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) return; @@ -129,19 +129,19 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, trace_recursion_set(TRACE_GLOBAL_BIT); op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { - op->func(ip, parent_ip, op); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); /*see above*/ }; trace_recursion_clear(TRACE_GLOBAL_BIT); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip, op); + ftrace_pid_function(ip, parent_ip, op, regs); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -763,7 +763,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops) + struct ftrace_ops *ops, struct pt_regs *regs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -793,7 +793,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0, NULL); + function_profile_call(trace->func, 0, NULL, NULL); return 1; } @@ -2771,7 +2771,7 @@ static int __init ftrace_mod_cmd_init(void) device_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_func_probe *entry; struct hlist_head *hhd; @@ -3923,7 +3923,7 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) return; @@ -3938,7 +3938,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, while (op != &ftrace_list_end) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) - op->func(ip, parent_ip, op); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; @@ -3952,7 +3952,7 @@ static struct ftrace_ops control_ops = { static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored) + struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; @@ -3971,7 +3971,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip, op); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); @@ -3983,17 +3983,24 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * the list function ignores the op parameter, we do not want any * C side effects, where a function is called without the caller * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. + * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORT_FTARCE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { - __ftrace_ops_list_func(ip, parent_ip, NULL); + __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { - __ftrace_ops_list_func(ip, parent_ip, NULL); + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } #endif diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a872a9a298a0..9824419c8404 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -259,7 +259,7 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops) + struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; struct hlist_head *head; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 88daa5177bf4..8c6696833686 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1682,7 +1682,7 @@ static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ring_buffer_event *event; struct ring_buffer *buffer; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fceb7a9aa06d..5675ebd541f0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -49,7 +49,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -77,7 +77,8 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) + { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -109,7 +110,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2862c77f95d9..c7a9ba936de6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -137,7 +137,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0caf4f5da569..7547e36d483e 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -108,7 +108,8 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9ae40c823af8..add37e019fd0 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -104,7 +104,8 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe1_cnt++; } @@ -112,7 +113,8 @@ static void trace_selftest_test_probe1_func(unsigned long ip, static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe2_cnt++; } @@ -120,7 +122,8 @@ static void trace_selftest_test_probe2_func(unsigned long ip, static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe3_cnt++; } @@ -128,7 +131,8 @@ static void trace_selftest_test_probe3_func(unsigned long ip, static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_global_cnt++; } @@ -136,7 +140,8 @@ static void trace_selftest_test_global_func(unsigned long ip, static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_dyn_cnt++; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e20006d5fb6a..2fa5328e8893 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,8 @@ static inline void check_stack(void) } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { int cpu; -- cgit v1.2.2 From 08f6fba503111e0336f2b4d6915a4a18f9b60e51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Apr 2012 16:20:23 -0400 Subject: ftrace/x86: Add separate function to save regs Add a way to have different functions calling different trampolines. If a ftrace_ops wants regs saved on the return, then have only the functions with ops registered to save regs. Functions registered by other ops would not be affected, unless the functions overlap. If one ftrace_ops registered functions A, B and C and another ops registered fucntions to save regs on A, and D, then only functions A and D would be saving regs. Function B and C would work as normal. Although A is registered by both ops: normal and saves regs; this is fine as saving the regs is needed to satisfy one of the ops that calls it but the regs are ignored by the other ops function. x86_64 implements the full regs saving, and i386 just passes a NULL for regs to satisfy the ftrace_ops passing. Where an arch must supply both regs and ftrace_ops parameters, even if regs is just NULL. It is OK for an arch to pass NULL regs. All function trace users that require regs passing must add the flag FTRACE_OPS_FL_SAVE_REGS when registering the ftrace_ops. If the arch does not support saving regs then the ftrace_ops will fail to register. The flag FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED may be set that will prevent the ftrace_ops from failing to register. In this case, the handler may either check if regs is not NULL or check if ARCH_SUPPORTS_FTRACE_SAVE_REGS. If the arch supports passing regs it will set this macro and pass regs for ops that request them. All other archs will just pass NULL. Link: Link: http://lkml.kernel.org/r/20120711195745.107705970@goodmis.org Cc: Alexander van Heukelum Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6ff07ad0ede3..c55f7e274613 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -314,6 +314,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; +#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -1515,6 +1529,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags++; if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) return; + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) return; @@ -1606,18 +1626,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; + /* + * If enabling and the REGS flag does not match the REGS_EN, then + * do not ignore this record. Set flags to fail the compare against + * ENABLED. + */ + if (flag && + (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) + flag |= FTRACE_FL_REGS; + /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) return FTRACE_UPDATE_IGNORE; if (flag) { - if (update) + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { rec->flags |= FTRACE_FL_ENABLED; - return FTRACE_UPDATE_MAKE_CALL; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, if the EN flag is set, then return + * UPDATE_MODIFY_CALL_REGS to tell the caller to convert + * from the non-save regs, to a save regs function. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function. + */ + if (flag & FTRACE_FL_ENABLED) + return FTRACE_UPDATE_MAKE_CALL; + else if (rec->flags & FTRACE_FL_REGS_EN) + return FTRACE_UPDATE_MODIFY_CALL_REGS; + else + return FTRACE_UPDATE_MODIFY_CALL; } - if (update) - rec->flags &= ~FTRACE_FL_ENABLED; + if (update) { + /* If there's no more users, clear all flags */ + if (!(rec->flags & ~FTRACE_FL_MASK)) + rec->flags = 0; + else + /* Just disable the record (keep REGS state) */ + rec->flags &= ~FTRACE_FL_ENABLED; + } return FTRACE_UPDATE_MAKE_NOP; } @@ -1652,13 +1713,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + unsigned long ftrace_old_addr; unsigned long ftrace_addr; int ret; - ftrace_addr = (unsigned long)FTRACE_ADDR; - ret = ftrace_update_record(rec, enable); + if (rec->flags & FTRACE_FL_REGS) + ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; + else + ftrace_addr = (unsigned long)FTRACE_ADDR; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; @@ -1668,6 +1733,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); + + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + if (rec->flags & FTRACE_FL_REGS) + ftrace_old_addr = (unsigned long)FTRACE_ADDR; + else + ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; + + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } return -1; /* unknow ftrace bug */ @@ -2421,8 +2495,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)", - rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, " (%ld)%s", + rec->flags & ~FTRACE_FL_MASK, + rec->flags & FTRACE_FL_REGS ? " R" : ""); seq_printf(m, "\n"); return 0; -- cgit v1.2.2 From 4740974a6844156c14d741b0080b59d275679a23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 11:04:44 -0400 Subject: ftrace: Add default recursion protection for function tracing As more users of the function tracer utility are being added, they do not always add the necessary recursion protection. To protect from function recursion due to tracing, if the callback ftrace_ops does not specifically specify that it protects against recursion (by setting the FTRACE_OPS_FL_RECURSION_SAFE flag), the list operation will be called by the mcount trampoline which adds recursion protection. If the flag is set, then the function will be called directly with no extra protection. Note, the list operation is called if more than one function callback is registered, or if the arch does not support all of the function tracer features. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 ++++++++-- kernel/trace/trace_events.c | 1 + kernel/trace/trace_functions.c | 4 ++-- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 7 +++++-- kernel/trace/trace_stack.c | 1 + 7 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c55f7e274613..ad765b4ba426 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -221,12 +222,13 @@ static void update_ftrace_function(void) /* * If we are at the end of the list and this ops is - * not dynamic and the arch supports passing ops, then have the - * mcount trampoline call the function directly. + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) @@ -867,6 +869,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int register_ftrace_profiler(void) @@ -1049,6 +1052,7 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static DEFINE_MUTEX(ftrace_regex_lock); @@ -3967,6 +3971,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int __init ftrace_nodyn_init(void) @@ -4023,6 +4028,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static inline void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8c6696833686..6825d833a257 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1721,6 +1721,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5675ebd541f0..fdff65dff1bb 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -153,13 +153,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; /* Our two options */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c7a9ba936de6..d98ee8283b29 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -154,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7547e36d483e..02170c00c413 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index add37e019fd0..1fb6da85ff8b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -148,19 +148,22 @@ static void trace_selftest_test_dyn_func(unsigned long ip, static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL, + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2fa5328e8893..0c1b165778e5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -137,6 +137,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t -- cgit v1.2.2 From 47239c4d8d6a24796039cada69d477a2b8cac9d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 11:13:07 -0400 Subject: ftrace: Only compile ftrace selftest if selftests are enabled No need to compile in the ftrace selftest helper file if selftests are not being executed. Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b831087c8200..837090808aac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +endif # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES -- cgit v1.2.2 From ea701f11da44b44907af226fe5a5f57d2f26eeb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 13:08:05 -0400 Subject: ftrace: Add selftest to test function trace recursion protection Add selftests to test the function tracing recursion protection actually does work. It also tests if a ftrace_ops states it will perform its own protection. Although, even if the ftrace_ops states it will protect itself, the ftrace infrastructure may still provide protection if the arch does not support all features or another ftrace_ops is registered. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 +++++++ kernel/trace/trace_selftest.c | 136 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ad765b4ba426..528d997c7f99 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,27 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void) +{ + struct ftrace_ops *ops; + int cnt = 0; + + mutex_lock(&ftrace_lock); + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) + cnt++; + + mutex_unlock(&ftrace_lock); + + return cnt; +} + /* * Traverse the ftrace_global_list, invoking all entries. The reason that we * can use rcu_dereference_raw() is that elements removed from this list diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 1fb6da85ff8b..86422f91dbe1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -406,8 +406,141 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, return ret; } + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + trace_selftest_recursion_cnt++; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. + */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + int len; + int ret; + int cnt; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 1) { + pr_cont("*callback not called once (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + /* + * If arch supports all ftrace features, and no other task + * was on the list, we should be fine. + */ + if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) + cnt = 2; /* Should have recursed */ + else + cnt = 1; + + ret = -1; + if (trace_selftest_recursion_cnt != cnt) { + pr_cont("*callback not called expected %d times (%d)* ", + cnt, trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + return ret; +} #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ /* @@ -455,7 +588,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ret = trace_selftest_startup_dynamic_tracing(trace, tr, DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + ret = trace_selftest_function_recursion(); out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; -- cgit v1.2.2 From ad97772ad82f57c83968079d0880c71ab126ab04 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 13:45:59 -0400 Subject: ftrace: Add selftest to test function save-regs support Add selftests to test the save-regs functionality of ftrace. If the arch supports saving regs, then it will make sure that regs is at least not NULL in the callback. If the arch does not support saving regs, it makes sure that the registering of the ftrace_ops that requests saving regs fails. It then tests the registering of the ftrace_ops succeeds if the 'IF_SUPPORTED' flag is set. Then it makes sure that the regs passed to the function is NULL. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_selftest.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..593debefc4e9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -472,11 +472,11 @@ extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -#endif extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 86422f91dbe1..1003a4d5eb25 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -543,6 +543,116 @@ out: # define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + if (pt_regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. + */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. + */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + return ret; +} + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -592,6 +702,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) goto out; ret = trace_selftest_function_recursion(); + if (ret) + goto out; + + ret = trace_selftest_function_regs(); out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; -- cgit v1.2.2 From 647664eaf4033501739ac1f42dd52ce8c9266ccc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:08 +0900 Subject: ftrace: add ftrace_set_filter_ip() for address based filter Add a new filter update interface ftrace_set_filter_ip() to set ftrace filter by ip address, not only glob pattern. Link: http://lkml.kernel.org/r/20120605102808.27845.67952.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 528d997c7f99..9dcf15d38380 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3242,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable) +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + if (!ftrace_location(ip)) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + return add_hash_entry(hash, ip); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -3272,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } + if (ip) { + ret = ftrace_match_addr(hash, ip, remove); + if (ret < 0) + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3288,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, return ret; } +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, + int reset, int enable) +{ + return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. + */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @ops - the ops to set the filter with -- cgit v1.2.2 From 72ef3794c5cd5f5f0e6355c24a529224c449cd14 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jun 2012 19:28:14 +0900 Subject: kprobes: Inverse taking of module_mutex with kprobe_mutex Currently module_mutex is taken before kprobe_mutex, but this can cause issues when we have kprobes register ftrace, as the ftrace mutex is taken before enabling a tracepoint, which currently takes the module mutex. If module_mutex is taken before kprobe_mutex, then we can not have kprobes use the ftrace infrastructure. There seems to be no reason that the kprobe_mutex can't be taken before the module_mutex. Running lockdep shows that it is safe among the kernels I've run. Link: http://lkml.kernel.org/r/20120605102814.27845.21047.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c62b8546cc90..7a8a1222c7b1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) { LIST_HEAD(free_list); + mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(&free_list); - mutex_unlock(&kprobe_mutex); mutex_unlock(&module_mutex); + mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) -- cgit v1.2.2 From f7fa6ef0ded995aad68650a877198f70e44b7621 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:20 +0900 Subject: kprobes: cleanup to separate probe-able check Separate probe-able address checking code from register_kprobe(). Link: http://lkml.kernel.org/r/20120605102820.27845.90133.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 82 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7a8a1222c7b1..6137fe32b4b8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1313,67 +1313,80 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -int __kprobes register_kprobe(struct kprobe *p) +static __kprobes int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; - struct kprobe *old_p; - struct module *probed_mod; - kprobe_opcode_t *addr; - - addr = kprobe_addr(p); - if (IS_ERR(addr)) - return PTR_ERR(addr); - p->addr = addr; - - ret = check_kprobe_rereg(p); - if (ret) - return ret; jump_label_lock(); preempt_disable(); + + /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; - goto cannot_probe; + goto out; } - /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ - p->flags &= KPROBE_FLAG_DISABLED; - - /* - * Check if are we probing a module. - */ - probed_mod = __module_text_address((unsigned long) p->addr); - if (probed_mod) { - /* Return -ENOENT if fail. */ - ret = -ENOENT; + /* Check if are we probing a module */ + *probed_mod = __module_text_address((unsigned long) p->addr); + if (*probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) - goto cannot_probe; + if (unlikely(!try_module_get(*probed_mod))) { + ret = -ENOENT; + goto out; + } /* * If the module freed .init.text, we couldn't insert * kprobes in there. */ - if (within_module_init((unsigned long)p->addr, probed_mod) && - probed_mod->state != MODULE_STATE_COMING) { - module_put(probed_mod); - goto cannot_probe; + if (within_module_init((unsigned long)p->addr, *probed_mod) && + (*probed_mod)->state != MODULE_STATE_COMING) { + module_put(*probed_mod); + *probed_mod = NULL; + ret = -ENOENT; } - /* ret will be updated by following code */ } +out: preempt_enable(); jump_label_unlock(); + return ret; +} + +int __kprobes register_kprobe(struct kprobe *p) +{ + int ret; + struct kprobe *old_p; + struct module *probed_mod; + kprobe_opcode_t *addr; + + /* Adjust probe address from symbol */ + addr = kprobe_addr(p); + if (IS_ERR(addr)) + return PTR_ERR(addr); + p->addr = addr; + + ret = check_kprobe_rereg(p); + if (ret) + return ret; + + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; p->nmissed = 0; INIT_LIST_HEAD(&p->list); - mutex_lock(&kprobe_mutex); + ret = check_kprobe_address_safe(p, &probed_mod); + if (ret) + return ret; + + mutex_lock(&kprobe_mutex); jump_label_lock(); /* needed to call jump_label_text_reserved() */ get_online_cpus(); /* For avoiding text_mutex deadlock. */ @@ -1410,11 +1423,6 @@ out: module_put(probed_mod); return ret; - -cannot_probe: - preempt_enable(); - jump_label_unlock(); - return ret; } EXPORT_SYMBOL_GPL(register_kprobe); -- cgit v1.2.2 From 25764288d8dc4792f0f487baf043ccfee5d8c2ba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:26 +0900 Subject: kprobes: Move locks into appropriate functions Break a big critical region into fine-grained pieces at registering kprobe path. This helps us to solve circular locking dependency when introducing ftrace-based kprobes. Link: http://lkml.kernel.org/r/20120605102826.27845.81689.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 63 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6137fe32b4b8..9e47f44f3531 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -759,20 +759,28 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) struct kprobe *ap; struct optimized_kprobe *op; + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + mutex_lock(&text_mutex); + ap = alloc_aggr_kprobe(p); if (!ap) - return; + goto out; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ arch_remove_optimized_kprobe(op); kfree(op); - return; + goto out; } init_aggr_kprobe(ap, p); - optimize_kprobe(ap); + optimize_kprobe(ap); /* This just kicks optimizer thread */ + +out: + mutex_unlock(&text_mutex); + jump_label_unlock(); } #ifdef CONFIG_SYSCTL @@ -1144,12 +1152,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; - if (kprobe_disabled(ap) && !kprobe_disabled(p)) { - ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) - /* Arm the breakpoint again. */ - __arm_kprobe(ap); - } return 0; } @@ -1189,11 +1191,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, int ret = 0; struct kprobe *ap = orig_p; + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + /* + * Get online CPUs to avoid text_mutex deadlock.with stop machine, + * which is invoked by unoptimize_kprobe() in add_new_kprobe() + */ + get_online_cpus(); + mutex_lock(&text_mutex); + if (!kprobe_aggrprobe(orig_p)) { /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ ap = alloc_aggr_kprobe(orig_p); - if (!ap) - return -ENOMEM; + if (!ap) { + ret = -ENOMEM; + goto out; + } init_aggr_kprobe(ap, orig_p); } else if (kprobe_unused(ap)) /* This probe is going to die. Rescue it */ @@ -1213,7 +1226,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, * free aggr_probe. It will be used next time, or * freed by unregister_kprobe. */ - return ret; + goto out; /* Prepare optimized instructions if possible. */ prepare_optimized_kprobe(ap); @@ -1228,7 +1241,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, /* Copy ap's insn slot to p */ copy_kprobe(ap, p); - return add_new_kprobe(ap, p); + ret = add_new_kprobe(ap, p); + +out: + mutex_unlock(&text_mutex); + put_online_cpus(); + jump_label_unlock(); + + if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if (!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arm_kprobe(ap); + } + return ret; } static int __kprobes in_kprobes_functions(unsigned long addr) @@ -1387,10 +1413,6 @@ int __kprobes register_kprobe(struct kprobe *p) return ret; mutex_lock(&kprobe_mutex); - jump_label_lock(); /* needed to call jump_label_text_reserved() */ - - get_online_cpus(); /* For avoiding text_mutex deadlock. */ - mutex_lock(&text_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -1399,7 +1421,9 @@ int __kprobes register_kprobe(struct kprobe *p) goto out; } + mutex_lock(&text_mutex); /* Avoiding text modification */ ret = arch_prepare_kprobe(p); + mutex_unlock(&text_mutex); if (ret) goto out; @@ -1408,15 +1432,12 @@ int __kprobes register_kprobe(struct kprobe *p) &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); /* Try to optimize kprobe */ try_to_optimize_kprobe(p); out: - mutex_unlock(&text_mutex); - put_online_cpus(); - jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) -- cgit v1.2.2 From ae6aa16fdc163afe6b04b6c073ad4ddd4663c03b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:32 +0900 Subject: kprobes: introduce ftrace based optimization Introduce function trace based kprobes optimization. With using ftrace optimization, kprobes on the mcount calling address, use ftrace's mcount call instead of breakpoint. Furthermore, this optimization works with preemptive kernel not like as current jump-based optimization. Of cource, this feature works only if the probe is on mcount call. Only if kprobe.break_handler is set, that probe is not optimized with ftrace (nor put on ftrace). The reason why this limitation comes is that this break_handler may be used only from jprobes which changes ip address (for fetching the function arguments), but function tracer ignores modified ip address. Changes in v2: - Fix ftrace_ops registering right after setting its filter. - Unregister ftrace_ops if there is no kprobe using. - Remove notrace dependency from __kprobes macro. Link: http://lkml.kernel.org/r/20120605102832.27845.63461.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 92 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47f44f3531..69c16efc315b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -759,6 +759,10 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) struct kprobe *ap; struct optimized_kprobe *op; + /* Impossible to optimize ftrace-based kprobe */ + if (kprobe_ftrace(p)) + return; + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); mutex_lock(&text_mutex); @@ -915,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ +#ifdef KPROBES_CAN_USE_FTRACE +static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { + .regs_func = kprobe_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; +static int kprobe_ftrace_enabled; + +/* Must ensure p->addr is really on ftrace */ +static int __kprobes prepare_kprobe(struct kprobe *p) +{ + if (!kprobe_ftrace(p)) + return arch_prepare_kprobe(p); + + return arch_prepare_kprobe_ftrace(p); +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 0, 0); + WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + kprobe_ftrace_enabled++; + if (kprobe_ftrace_enabled == 1) { + ret = register_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + kprobe_ftrace_enabled--; + if (kprobe_ftrace_enabled == 0) { + ret = unregister_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 1, 0); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); +} +#else /* !KPROBES_CAN_USE_FTRACE */ +#define prepare_kprobe(p) arch_prepare_kprobe(p) +#define arm_kprobe_ftrace(p) do {} while (0) +#define disarm_kprobe_ftrace(p) do {} while (0) +#endif + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + if (unlikely(kprobe_ftrace(kp))) { + arm_kprobe_ftrace(kp); + return; + } /* * Here, since __arm_kprobe() doesn't use stop_machine(), * this doesn't cause deadlock on text_mutex. So, we don't @@ -929,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp) +static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) { + if (unlikely(kprobe_ftrace(kp))) { + disarm_kprobe_ftrace(kp); + return; + } /* Ditto */ mutex_lock(&text_mutex); - __disarm_kprobe(kp, true); + __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); } @@ -1343,6 +1406,26 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, struct module **probed_mod) { int ret = 0; + unsigned long ftrace_addr; + + /* + * If the address is located on a ftrace nop, set the + * breakpoint to the following instruction. + */ + ftrace_addr = ftrace_location((unsigned long)p->addr); + if (ftrace_addr) { +#ifdef KPROBES_CAN_USE_FTRACE + /* Given address is not on the instruction boundary */ + if ((unsigned long)p->addr != ftrace_addr) + return -EILSEQ; + /* break_handler (jprobe) can not work with ftrace */ + if (p->break_handler) + return -EINVAL; + p->flags |= KPROBE_FLAG_FTRACE; +#else /* !KPROBES_CAN_USE_FTRACE */ + return -EINVAL; +#endif + } jump_label_lock(); preempt_disable(); @@ -1350,7 +1433,6 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -1422,7 +1504,7 @@ int __kprobes register_kprobe(struct kprobe *p) } mutex_lock(&text_mutex); /* Avoiding text modification */ - ret = arch_prepare_kprobe(p); + ret = prepare_kprobe(p); mutex_unlock(&text_mutex); if (ret) goto out; @@ -1480,7 +1562,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p); + disarm_kprobe(orig_p, true); orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -2078,10 +2160,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, if (!pp) pp = p; - seq_printf(pi, "%s%s%s\n", + seq_printf(pi, "%s%s%s%s\n", (kprobe_gone(p) ? "[GONE]" : ""), ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), - (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); + (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), + (kprobe_ftrace(pp) ? "[FTRACE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -2160,14 +2243,12 @@ static void __kprobes arm_all_kprobes(void) goto already_enabled; /* Arming kprobes doesn't optimize kprobe itself */ - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); } - mutex_unlock(&text_mutex); kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); @@ -2195,15 +2276,13 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p, false); + disarm_kprobe(p, false); } } - mutex_unlock(&text_mutex); mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ -- cgit v1.2.2 From e52538965119319447c0800c534da73142c27be2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:38 +0900 Subject: kprobes/x86: ftrace based optimization for x86 Add function tracer based kprobe optimization support handlers on x86. This allows kprobes to use function tracer for probing on mcount call. Link: http://lkml.kernel.org/r/20120605102838.27845.26317.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu [ Updated to new port of ftrace save regs functions ] Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 69c16efc315b..35b4315d84f5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -921,7 +921,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef KPROBES_CAN_USE_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { - .regs_func = kprobe_ftrace_handler, + .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int kprobe_ftrace_enabled; -- cgit v1.2.2 From 9f99798ff49e73dded73a8c674044ea6fb6af651 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 31 Jul 2012 20:37:00 +0900 Subject: ptrace: mark __ptrace_may_access() static __ptrace_may_access() is used within only kernel/ptrace.c. Signed-off-by: Tetsuo Handa Signed-off-by: James Morris --- kernel/ptrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a232bb59d93f..1f5e55dda955 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return has_ns_capability(current, ns, CAP_SYS_PTRACE); } -int __ptrace_may_access(struct task_struct *task, unsigned int mode) +/* Returns 0 on success, -errno on denial. */ +static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; -- cgit v1.2.2 From 0a13c00e9d4502b8e3fd9260ce781758ff2c3970 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:44 -0700 Subject: workqueue: reorder queueing functions so that _on() variants are on top Currently, queue/schedule[_delayed]_work_on() are located below the counterpart without the _on postifx even though the latter is usually implemented using the former. Swap them. This is cleanup and doesn't cause any functional difference. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 124 ++++++++++++++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a10..07d309e7e359 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1052,27 +1052,6 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, spin_unlock_irqrestore(&gcwq->lock, flags); } -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - int ret; - - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); - - return ret; -} -EXPORT_SYMBOL_GPL(queue_work); - /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on @@ -1097,31 +1076,34 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work_on); -static void delayed_work_timer_fn(unsigned long __data) -{ - struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - - __queue_work(smp_processor_id(), cwq->wq, &dwork->work); -} - /** - * queue_delayed_work - queue work on a workqueue after delay + * queue_work - queue work on a workqueue * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing + * @work: work to queue * * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. */ -int queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +int queue_work(struct workqueue_struct *wq, struct work_struct *work) { - if (delay == 0) - return queue_work(wq, &dwork->work); + int ret; - return queue_delayed_work_on(-1, wq, dwork, delay); + ret = queue_work_on(get_cpu(), wq, work); + put_cpu(); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_work); + +static void delayed_work_timer_fn(unsigned long __data) +{ + struct delayed_work *dwork = (struct delayed_work *)__data; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); + + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } -EXPORT_SYMBOL_GPL(queue_delayed_work); /** * queue_delayed_work_on - queue work on specific CPU after delay @@ -1178,6 +1160,24 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + */ +int queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + if (delay == 0) + return queue_work(wq, &dwork->work); + + return queue_delayed_work_on(-1, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); + /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -2877,6 +2877,19 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); +/* + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +int schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, system_wq, work); +} +EXPORT_SYMBOL(schedule_work_on); + /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -2894,18 +2907,21 @@ int schedule_work(struct work_struct *work) } EXPORT_SYMBOL(schedule_work); -/* - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait * - * This puts a job on a specific cpu + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. */ -int schedule_work_on(int cpu, struct work_struct *work) +int schedule_delayed_work_on(int cpu, + struct delayed_work *dwork, unsigned long delay) { - return queue_work_on(cpu, system_wq, work); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } -EXPORT_SYMBOL(schedule_work_on); +EXPORT_SYMBOL(schedule_delayed_work_on); /** * schedule_delayed_work - put work task in global workqueue after delay @@ -2922,22 +2938,6 @@ int schedule_delayed_work(struct delayed_work *dwork, } EXPORT_SYMBOL(schedule_delayed_work); -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call -- cgit v1.2.2 From d4283e9378619c14dc3826a6b0527eb5d967ffde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:44 -0700 Subject: workqueue: make queueing functions return bool All queueing functions return 1 on success, 0 if the work item was already pending. Update them to return bool instead. This signifies better that they don't return 0 / -errno. This is cleanup and doesn't cause any functional difference. While at it, fix comment opening for schedule_work_on(). Signed-off-by: Tejun Heo --- kernel/workqueue.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 07d309e7e359..70f95ab28f3d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1058,19 +1058,19 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to a specific CPU, the caller must ensure it * can't go away. */ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +bool queue_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work) { - int ret = 0; + bool ret = false; if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); - ret = 1; + ret = true; } return ret; } @@ -1081,14 +1081,14 @@ EXPORT_SYMBOL_GPL(queue_work_on); * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) +bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret; + bool ret; ret = queue_work_on(get_cpu(), wq, work); put_cpu(); @@ -1112,14 +1112,14 @@ static void delayed_work_timer_fn(unsigned long __data) * @dwork: work to queue * @delay: number of jiffies to wait before queueing * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. */ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) { - int ret = 0; struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + bool ret = false; if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; @@ -1154,7 +1154,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, add_timer_on(timer, cpu); else add_timer(timer); - ret = 1; + ret = true; } return ret; } @@ -1166,9 +1166,9 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. */ -int queue_delayed_work(struct workqueue_struct *wq, +bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { if (delay == 0) @@ -2877,14 +2877,14 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -/* +/** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ -int schedule_work_on(int cpu, struct work_struct *work) +bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } @@ -2894,14 +2894,14 @@ EXPORT_SYMBOL(schedule_work_on); * schedule_work - put work task in global workqueue * @work: job to be done * - * Returns zero if @work was already on the kernel-global workqueue and - * non-zero otherwise. + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. */ -int schedule_work(struct work_struct *work) +bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } @@ -2916,8 +2916,8 @@ EXPORT_SYMBOL(schedule_work); * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) +bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } @@ -2931,8 +2931,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) +bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } -- cgit v1.2.2 From 959d1af8cffc8fd38ed53e8be1cf4ab8782f9c00 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:45 -0700 Subject: workqueue: add missing smp_wmb() in process_one_work() WORK_STRUCT_PENDING is used to claim ownership of a work item and process_one_work() releases it before starting execution. When someone else grabs PENDING, all pre-release updates to the work item should be visible and all updates made by the new owner should happen afterwards. Grabbing PENDING uses test_and_set_bit() and thus has a full barrier; however, clearing doesn't have a matching wmb. Given the preceding spin_unlock and use of clear_bit, I don't believe this can be a problem on an actual machine and there hasn't been any related report but it still is theretically possible for clear_pending to permeate upwards and happen before work->entry update. Add an explicit smp_wmb() before work_clear_pending(). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: stable@vger.kernel.org --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 70f95ab28f3d..5c26d36146b7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1997,7 +1997,9 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); + smp_wmb(); /* paired with test_and_set_bit(PENDING) */ work_clear_pending(work); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); -- cgit v1.2.2 From 8930caba3dbdd8b86dd6934a5920bf61b53a931e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:45 -0700 Subject: workqueue: disable irq while manipulating PENDING Queueing operations use WORK_STRUCT_PENDING_BIT to synchronize access to the target work item. They first try to claim the bit and proceed with queueing only after that succeeds and there's a window between PENDING being set and the actual queueing where the task can be interrupted or preempted. There's also a similar window in process_one_work() when clearing PENDING. A work item is dequeued, gcwq->lock is released and then PENDING is cleared and the worker might get interrupted or preempted between releasing gcwq->lock and clearing PENDING. cancel[_delayed]_work_sync() tries to claim or steal PENDING. The function assumes that a work item with PENDING is either queued or in the process of being [de]queued. In the latter case, it busy-loops until either the work item loses PENDING or is queued. If canceling coincides with the above described interrupts or preemptions, the canceling task will busy-loop while the queueing or executing task is preempted. This patch keeps irq disabled across claiming PENDING and actual queueing and moves PENDING clearing in process_one_work() inside gcwq->lock so that busy looping from PENDING && !queued doesn't wait for interrupted/preempted tasks. Note that, in process_one_work(), setting last CPU and clearing PENDING got merged into single operation. This removes possible long busy-loops and will allow using try_to_grab_pending() from bh and irq contexts. v2: __queue_work() was testing preempt_count() to ensure that the caller has disabled preemption. This triggers spuriously if !CONFIG_PREEMPT_COUNT. Use preemptible() instead. Reported by Fengguang Wu. v3: Disable irq instead of preemption. IRQ will be disabled while grabbing gcwq->lock later anyway and this allows using try_to_grab_pending() from bh and irq contexts. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Fengguang Wu --- kernel/workqueue.c | 73 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5c26d36146b7..30474c4e107c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -537,9 +537,10 @@ static int work_next_color(int color) * work is on queue. Once execution starts, WORK_STRUCT_CWQ is * cleared and the work data contains the cpu number it was last on. * - * set_work_{cwq|cpu}() and clear_work_data() can be used to set the - * cwq, cpu or clear work->data. These functions should only be - * called while the work is owned - ie. while the PENDING bit is set. + * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data() + * can be used to set the cwq, cpu or clear work->data. These functions + * should only be called while the work is owned - ie. while the PENDING + * bit is set. * * get_work_[g]cwq() can be used to obtain the gcwq or cwq * corresponding to a work. gcwq is available once the work has been @@ -561,9 +562,10 @@ static void set_work_cwq(struct work_struct *work, WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } -static void set_work_cpu(struct work_struct *work, unsigned int cpu) +static void set_work_cpu_and_clear_pending(struct work_struct *work, + unsigned int cpu) { - set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, 0); } static void clear_work_data(struct work_struct *work) @@ -981,7 +983,14 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; - unsigned long flags; + + /* + * While a work item is PENDING && off queue, a task trying to + * steal the PENDING will busy-loop waiting for it to either get + * queued or lose PENDING. Grabbing PENDING and queueing should + * happen with IRQ disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); debug_work_activate(work); @@ -1008,7 +1017,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { struct worker *worker; - spin_lock_irqsave(&last_gcwq->lock, flags); + spin_lock(&last_gcwq->lock); worker = find_worker_executing_work(last_gcwq, work); @@ -1016,14 +1025,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, gcwq = last_gcwq; else { /* meh... not running there, queue here */ - spin_unlock_irqrestore(&last_gcwq->lock, flags); - spin_lock_irqsave(&gcwq->lock, flags); + spin_unlock(&last_gcwq->lock); + spin_lock(&gcwq->lock); } - } else - spin_lock_irqsave(&gcwq->lock, flags); + } else { + spin_lock(&gcwq->lock); + } } else { gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock_irqsave(&gcwq->lock, flags); + spin_lock(&gcwq->lock); } /* gcwq determined, get cwq and queue */ @@ -1031,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, trace_workqueue_queue_work(cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock(&gcwq->lock); return; } @@ -1049,7 +1059,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, insert_work(cwq, work, worklist, work_flags); - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock(&gcwq->lock); } /** @@ -1067,11 +1077,16 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; + unsigned long flags; + + local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } + + local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_on); @@ -1102,7 +1117,9 @@ static void delayed_work_timer_fn(unsigned long __data) struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); + local_irq_disable(); __queue_work(smp_processor_id(), cwq->wq, &dwork->work); + local_irq_enable(); } /** @@ -1120,6 +1137,10 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; bool ret = false; + unsigned long flags; + + /* read the comment in __queue_work() */ + local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; @@ -1156,6 +1177,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, add_timer(timer); ret = true; } + + local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(queue_delayed_work_on); @@ -1970,15 +1993,13 @@ __acquires(&gcwq->lock) return; } - /* claim and process */ + /* claim and dequeue */ debug_work_deactivate(work); hlist_add_head(&worker->hentry, bwh); worker->current_work = work; worker->current_cwq = cwq; work_color = get_work_color(work); - /* record the current cpu number in the work data and dequeue */ - set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); /* @@ -1995,10 +2016,18 @@ __acquires(&gcwq->lock) if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) wake_up_worker(pool); - spin_unlock_irq(&gcwq->lock); + /* + * Record the last CPU and clear PENDING. The following wmb is + * paired with the implied mb in test_and_set_bit(PENDING) and + * ensures all updates to @work made here are visible to and + * precede any updates by the next PENDING owner. Also, clear + * PENDING inside @gcwq->lock so that PENDING and queued state + * changes happen together while IRQ is disabled. + */ + smp_wmb(); + set_work_cpu_and_clear_pending(work, gcwq->cpu); - smp_wmb(); /* paired with test_and_set_bit(PENDING) */ - work_clear_pending(work); + spin_unlock_irq(&gcwq->lock); lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2836,9 +2865,11 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { + local_irq_disable(); if (del_timer_sync(&dwork->timer)) __queue_work(raw_smp_processor_id(), get_work_cwq(&dwork->work)->wq, &dwork->work); + local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); @@ -2857,9 +2888,11 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool flush_delayed_work_sync(struct delayed_work *dwork) { + local_irq_disable(); if (del_timer_sync(&dwork->timer)) __queue_work(raw_smp_processor_id(), get_work_cwq(&dwork->work)->wq, &dwork->work); + local_irq_enable(); return flush_work_sync(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work_sync); -- cgit v1.2.2 From d8e794dfd51c368ed3f686b7f4172830b60ae47b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:45 -0700 Subject: workqueue: set delayed_work->timer function on initialization delayed_work->timer.function is currently initialized during queue_delayed_work_on(). Export delayed_work_timer_fn() and set delayed_work timer function during delayed_work initialization together with other fields. This ensures the timer function is always valid on an initialized delayed_work. This is to help mod_delayed_work() implementation. To detect delayed_work users which diddle with the internal timer, trigger WARN if timer function doesn't match on queue. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 30474c4e107c..55392385fe30 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1112,7 +1112,7 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); @@ -1121,6 +1121,7 @@ static void delayed_work_timer_fn(unsigned long __data) __queue_work(smp_processor_id(), cwq->wq, &dwork->work); local_irq_enable(); } +EXPORT_SYMBOL_GPL(delayed_work_timer_fn); /** * queue_delayed_work_on - queue work on specific CPU after delay @@ -1145,6 +1146,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; + WARN_ON_ONCE(timer->function != delayed_work_timer_fn || + timer->data != (unsigned long)dwork); BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); @@ -1168,8 +1171,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, set_work_cwq(work, get_cwq(lcpu, wq), 0); timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; if (unlikely(cpu >= 0)) add_timer_on(timer, cpu); -- cgit v1.2.2 From 57469821fd5c61f25f783827d7334063cff67d65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:45 -0700 Subject: workqueue: unify local CPU queueing handling Queueing functions have been using different methods to determine the local CPU. * queue_work() superflously uses get/put_cpu() to acquire and hold the local CPU across queue_work_on(). * delayed_work_timer_fn() uses smp_processor_id(). * queue_delayed_work() calls queue_delayed_work_on() with -1 @cpu which is interpreted as the local CPU. * flush_delayed_work[_sync]() were using raw_smp_processor_id(). * __queue_work() interprets %WORK_CPU_UNBOUND as local CPU if the target workqueue is bound one but nobody uses this. This patch converts all functions to uniformly use %WORK_CPU_UNBOUND to indicate local CPU and use the local binding feature of __queue_work(). unlikely() is dropped from %WORK_CPU_UNBOUND handling in __queue_work(). Signed-off-by: Tejun Heo --- kernel/workqueue.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 55392385fe30..ce60bb5d12fb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1003,7 +1003,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; - if (unlikely(cpu == WORK_CPU_UNBOUND)) + if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); /* @@ -1103,12 +1103,7 @@ EXPORT_SYMBOL_GPL(queue_work_on); */ bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { - bool ret; - - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); - - return ret; + return queue_work_on(WORK_CPU_UNBOUND, wq, work); } EXPORT_SYMBOL_GPL(queue_work); @@ -1118,7 +1113,7 @@ void delayed_work_timer_fn(unsigned long __data) struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); local_irq_disable(); - __queue_work(smp_processor_id(), cwq->wq, &dwork->work); + __queue_work(WORK_CPU_UNBOUND, cwq->wq, &dwork->work); local_irq_enable(); } EXPORT_SYMBOL_GPL(delayed_work_timer_fn); @@ -1172,7 +1167,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, timer->expires = jiffies + delay; - if (unlikely(cpu >= 0)) + if (unlikely(cpu != WORK_CPU_UNBOUND)) add_timer_on(timer, cpu); else add_timer(timer); @@ -1198,7 +1193,7 @@ bool queue_delayed_work(struct workqueue_struct *wq, if (delay == 0) return queue_work(wq, &dwork->work); - return queue_delayed_work_on(-1, wq, dwork, delay); + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } EXPORT_SYMBOL_GPL(queue_delayed_work); @@ -2868,7 +2863,7 @@ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), + __queue_work(WORK_CPU_UNBOUND, get_work_cwq(&dwork->work)->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); @@ -2891,7 +2886,7 @@ bool flush_delayed_work_sync(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), + __queue_work(WORK_CPU_UNBOUND, get_work_cwq(&dwork->work)->wq, &dwork->work); local_irq_enable(); return flush_work_sync(&dwork->work); -- cgit v1.2.2 From 715f1300802e6eaefa85f6cfc70ae99af3d5d497 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:46 -0700 Subject: workqueue: fix zero @delay handling of queue_delayed_work_on() If @delay is zero and the dealyed_work is idle, queue_delayed_work() queues it for immediate execution; however, queue_delayed_work_on() lacks this logic and always goes through timer regardless of @delay. This patch moves 0 @delay handling logic from queue_delayed_work() to queue_delayed_work_on() so that both functions behave the same. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ce60bb5d12fb..6cbdc22f8ec7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1125,7 +1125,9 @@ EXPORT_SYMBOL_GPL(delayed_work_timer_fn); * @dwork: work to queue * @delay: number of jiffies to wait before queueing * - * Returns %false if @work was already on a queue, %true otherwise. + * Returns %false if @work was already on a queue, %true otherwise. If + * @delay is zero and @dwork is idle, it will be scheduled for immediate + * execution. */ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) @@ -1135,6 +1137,9 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; + if (!delay) + return queue_work_on(cpu, wq, &dwork->work); + /* read the comment in __queue_work() */ local_irq_save(flags); @@ -1185,14 +1190,11 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * - * Returns %false if @work was already on a queue, %true otherwise. + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - if (delay == 0) - return queue_work(wq, &dwork->work); - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } EXPORT_SYMBOL_GPL(queue_delayed_work); -- cgit v1.2.2 From bf4ede014ea886b71ef71368738da35b316cb7c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:46 -0700 Subject: workqueue: move try_to_grab_pending() upwards try_to_grab_pending() will be used by to-be-implemented mod_delayed_work[_on](). Move try_to_grab_pending() and related functions above queueing functions. This patch only moves functions around. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 286 ++++++++++++++++++++++++++--------------------------- 1 file changed, 143 insertions(+), 143 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6cbdc22f8ec7..0f50f4078e36 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -903,6 +903,149 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, work); } +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + + trace_workqueue_activate_work(work); + move_linked_works(work, &cwq->pool->worklist, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + cwq->nr_active++; +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * @delayed: for a delayed work + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, + bool delayed) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + if (!delayed) { + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + } + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/* + * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. + */ +static int try_to_grab_pending(struct work_struct *work) +{ + struct global_cwq *gcwq; + int ret = -1; + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) + return 0; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + gcwq = get_work_gcwq(work); + if (!gcwq) + return ret; + + spin_lock_irq(&gcwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong gcwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (gcwq == get_work_gcwq(work)) { + debug_work_deactivate(work); + list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work), + *work_data_bits(work) & WORK_STRUCT_DELAYED); + ret = 1; + } + } + spin_unlock_irq(&gcwq->lock); + + return ret; +} + /** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to @@ -1831,107 +1974,6 @@ static bool manage_workers(struct worker *worker) return ret; } -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) -{ - struct work_struct *work = list_first_entry(&cwq->delayed_works, - struct work_struct, entry); - - trace_workqueue_activate_work(work); - move_linked_works(work, &cwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; -} - -/** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest - * @color: color of work which left the queue - * @delayed: for a delayed work - * - * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, - bool delayed) -{ - /* ignore uncolored works */ - if (color == WORK_NO_COLOR) - return; - - cwq->nr_in_flight[color]--; - - if (!delayed) { - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } - } - - /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) - return; - - /* are there still in-flight works? */ - if (cwq->nr_in_flight[color]) - return; - - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; - - /* - * If this was the last cwq, wake up the first flusher. It - * will handle the rest. - */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); -} - /** * process_one_work - process single work * @worker: self @@ -2767,48 +2809,6 @@ bool flush_work_sync(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work_sync); -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct global_cwq *gcwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - gcwq = get_work_gcwq(work); - if (!gcwq) - return ret; - - spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). - */ - smp_rmb(); - if (gcwq == get_work_gcwq(work)) { - debug_work_deactivate(work); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work), - *work_data_bits(work) & WORK_STRUCT_DELAYED); - ret = 1; - } - } - spin_unlock_irq(&gcwq->lock); - - return ret; -} - static bool __cancel_work_timer(struct work_struct *work, struct timer_list* timer) { -- cgit v1.2.2 From b5490077274482efde57a50b060b99bc839acd45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:46 -0700 Subject: workqueue: introduce WORK_OFFQ_FLAG_* Low WORK_STRUCT_FLAG_BITS bits of work_struct->data contain WORK_STRUCT_FLAG_* and flush color. If the work item is queued, the rest point to the cpu_workqueue with WORK_STRUCT_CWQ set; otherwise, WORK_STRUCT_CWQ is clear and the bits contain the last CPU number - either a real CPU number or one of WORK_CPU_*. Scheduled addition of mod_delayed_work[_on]() requires an additional flag, which is used only while a work item is off queue. There are more than enough bits to represent off-queue CPU number on both 32 and 64bits. This patch introduces WORK_OFFQ_FLAG_* which occupy the lower part of the @work->data high bits while off queue. This patch doesn't define any actual OFFQ flag yet. Off-queue CPU number is now shifted by WORK_OFFQ_CPU_SHIFT, which adds the number of bits used by OFFQ flags to WORK_STRUCT_FLAG_SHIFT, to make room for OFFQ flags. To avoid shift width warning with large WORK_OFFQ_FLAG_BITS, ulong cast is added to WORK_STRUCT_NO_CPU and, just in case, BUILD_BUG_ON() to check that there are enough bits to accomodate off-queue CPU number is added. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0f50f4078e36..eeae77079483 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -533,9 +533,9 @@ static int work_next_color(int color) } /* - * A work's data points to the cwq with WORK_STRUCT_CWQ set while the - * work is on queue. Once execution starts, WORK_STRUCT_CWQ is - * cleared and the work data contains the cpu number it was last on. + * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data + * contain the pointer to the queued cwq. Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and CPU number. * * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data() * can be used to set the cwq, cpu or clear work->data. These functions @@ -565,7 +565,7 @@ static void set_work_cwq(struct work_struct *work, static void set_work_cpu_and_clear_pending(struct work_struct *work, unsigned int cpu) { - set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, 0); + set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); } static void clear_work_data(struct work_struct *work) @@ -592,7 +592,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return ((struct cpu_workqueue_struct *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; - cpu = data >> WORK_STRUCT_FLAG_BITS; + cpu = data >> WORK_OFFQ_CPU_SHIFT; if (cpu == WORK_CPU_NONE) return NULL; @@ -3724,6 +3724,10 @@ static int __init init_workqueues(void) unsigned int cpu; int i; + /* make sure we have enough bits for OFFQ CPU number */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < + WORK_CPU_LAST); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); -- cgit v1.2.2 From 7beb2edf44b4dea820c733046ad7666d092bb4b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:46 -0700 Subject: workqueue: factor out __queue_delayed_work() from queue_delayed_work_on() This is to prepare for mod_delayed_work[_on]() and doesn't cause any functional difference. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 74 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eeae77079483..d7f1b7e2bbaa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1261,6 +1261,46 @@ void delayed_work_timer_fn(unsigned long __data) } EXPORT_SYMBOL_GPL(delayed_work_timer_fn); +static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + unsigned int lcpu; + + WARN_ON_ONCE(timer->function != delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + timer_stats_timer_set_start_info(&dwork->timer); + + /* + * This stores cwq for the moment, for the timer_fn. Note that the + * work's gcwq is preserved to allow reentrance detection for + * delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else { + lcpu = WORK_CPU_UNBOUND; + } + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + + timer->expires = jiffies + delay; + + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); +} + /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on @@ -1275,7 +1315,6 @@ EXPORT_SYMBOL_GPL(delayed_work_timer_fn); bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; bool ret = false; unsigned long flags; @@ -1287,38 +1326,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - unsigned int lcpu; - - WARN_ON_ONCE(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - - timer_stats_timer_set_start_info(&dwork->timer); - - /* - * This stores cwq for the moment, for the timer_fn. - * Note that the work's gcwq is preserved to allow - * reentrance detection for delayed works. - */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); - - if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) - lcpu = gcwq->cpu; - else - lcpu = raw_smp_processor_id(); - } else - lcpu = WORK_CPU_UNBOUND; - - set_work_cwq(work, get_cwq(lcpu, wq), 0); - - timer->expires = jiffies + delay; - - if (unlikely(cpu != WORK_CPU_UNBOUND)) - add_timer_on(timer, cpu); - else - add_timer(timer); + __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } -- cgit v1.2.2 From 36e227d242f9ec7cb4a8e968561b3b26e3d8b5d1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:46 -0700 Subject: workqueue: reorganize try_to_grab_pending() and __cancel_timer_work() * Use bool @is_dwork instead of @timer and let try_to_grab_pending() use to_delayed_work() to determine the delayed_work address. * Move timer handling from __cancel_work_timer() to try_to_grab_pending(). * Make try_to_grab_pending() use -EAGAIN instead of -1 for busy-looping and drop the ret local variable. * Add proper function comment to try_to_grab_pending(). This makes the code a bit easier to understand and will ease further changes. This patch doesn't make any functional change. v2: Use @is_dwork instead of @timer. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d7f1b7e2bbaa..4b3663b1c677 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1004,15 +1004,33 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, complete(&cwq->wq->first_flusher->done); } -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. +/** + * try_to_grab_pending - steal work item from worklist + * @work: work item to steal + * @is_dwork: @work is a delayed_work + * + * Try to grab PENDING bit of @work. This function can handle @work in any + * stable state - idle, on timer or on worklist. Return values are + * + * 1 if @work was pending and we successfully stole PENDING + * 0 if @work was idle and we claimed PENDING + * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry + * + * On >= 0 return, the caller owns @work's PENDING bit. */ -static int try_to_grab_pending(struct work_struct *work) +static int try_to_grab_pending(struct work_struct *work, bool is_dwork) { struct global_cwq *gcwq; - int ret = -1; + /* try to steal the timer if it exists */ + if (is_dwork) { + struct delayed_work *dwork = to_delayed_work(work); + + if (likely(del_timer(&dwork->timer))) + return 1; + } + + /* try to claim PENDING the normal way */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; @@ -1022,7 +1040,7 @@ static int try_to_grab_pending(struct work_struct *work) */ gcwq = get_work_gcwq(work); if (!gcwq) - return ret; + return -EAGAIN; spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { @@ -1038,12 +1056,14 @@ static int try_to_grab_pending(struct work_struct *work) cwq_dec_nr_in_flight(get_work_cwq(work), get_work_color(work), *work_data_bits(work) & WORK_STRUCT_DELAYED); - ret = 1; + + spin_unlock_irq(&gcwq->lock); + return 1; } } spin_unlock_irq(&gcwq->lock); - return ret; + return -EAGAIN; } /** @@ -2817,15 +2837,12 @@ bool flush_work_sync(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work_sync); -static bool __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) +static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { int ret; do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); + ret = try_to_grab_pending(work, is_dwork); wait_on_work(work); } while (unlikely(ret < 0)); @@ -2853,7 +2870,7 @@ static bool __cancel_work_timer(struct work_struct *work, */ bool cancel_work_sync(struct work_struct *work) { - return __cancel_work_timer(work, NULL); + return __cancel_work_timer(work, false); } EXPORT_SYMBOL_GPL(cancel_work_sync); @@ -2914,7 +2931,7 @@ EXPORT_SYMBOL(flush_delayed_work_sync); */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { - return __cancel_work_timer(&dwork->work, &dwork->timer); + return __cancel_work_timer(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work_sync); -- cgit v1.2.2 From bbb68dfaba73e8338fe0f1dc711cc1d261daec87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:46 -0700 Subject: workqueue: mark a work item being canceled as such There can be two reasons try_to_grab_pending() can fail with -EAGAIN. One is when someone else is queueing or deqeueing the work item. With the previous patches, it is guaranteed that PENDING and queued state will soon agree making it safe to busy-retry in this case. The other is if multiple __cancel_work_timer() invocations are racing one another. __cancel_work_timer() grabs PENDING and then waits for running instances of the target work item on all CPUs while holding PENDING and !queued. try_to_grab_pending() invoked from another task will keep returning -EAGAIN while the current owner is waiting. Not distinguishing the two cases is okay because __cancel_work_timer() is the only user of try_to_grab_pending() and it invokes wait_on_work() whenever grabbing fails. For the first case, busy looping should be fine but wait_on_work() doesn't cause any critical problem. For the latter case, the new contender usually waits for the same condition as the current owner, so no unnecessarily extended busy-looping happens. Combined, these make __cancel_work_timer() technically correct even without irq protection while grabbing PENDING or distinguishing the two different cases. While the current code is technically correct, not distinguishing the two cases makes it difficult to use try_to_grab_pending() for other purposes than canceling because it's impossible to tell whether it's safe to busy-retry grabbing. This patch adds a mechanism to mark a work item being canceled. try_to_grab_pending() now disables irq on success and returns -EAGAIN to indicate that grabbing failed but PENDING and queued states are gonna agree soon and it's safe to busy-loop. It returns -ENOENT if the work item is being canceled and it may stay PENDING && !queued for arbitrary amount of time. __cancel_work_timer() is modified to mark the work canceling with WORK_OFFQ_CANCELING after grabbing PENDING, thus making try_to_grab_pending() fail with -ENOENT instead of -EAGAIN. Also, it invokes wait_on_work() iff grabbing failed with -ENOENT. This isn't necessary for correctness but makes it consistent with other future users of try_to_grab_pending(). v2: try_to_grab_pending() was testing preempt_count() to ensure that the caller has disabled preemption. This triggers spuriously if !CONFIG_PREEMPT_COUNT. Use preemptible() instead. Reported by Fengguang Wu. v3: Updated so that try_to_grab_pending() disables irq on success rather than requiring preemption disabled by the caller. This makes busy-looping easier and will allow try_to_grap_pending() to be used from bh/irq contexts. Signed-off-by: Tejun Heo Cc: Fengguang Wu --- kernel/workqueue.c | 90 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4b3663b1c677..b4a4e05c89e1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -537,15 +537,20 @@ static int work_next_color(int color) * contain the pointer to the queued cwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and CPU number. * - * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data() - * can be used to set the cwq, cpu or clear work->data. These functions - * should only be called while the work is owned - ie. while the PENDING - * bit is set. + * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the cwq, cpu or clear + * work->data. These functions should only be called while the work is + * owned - ie. while the PENDING bit is set. * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq - * corresponding to a work. gcwq is available once the work has been - * queued anywhere after initialization. cwq is available only from - * queueing until execution starts. + * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to + * a work. gcwq is available once the work has been queued anywhere after + * initialization until it is sync canceled. cwq is available only while + * the work item is queued. + * + * %WORK_OFFQ_CANCELING is used to mark a work item which is being + * canceled. While being canceled, a work item may have its PENDING set + * but stay off timer and worklist for arbitrarily long and nobody should + * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) @@ -600,6 +605,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return get_gcwq(cpu); } +static void mark_work_canceling(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + + set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, + WORK_STRUCT_PENDING); +} + +static bool work_is_canceling(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); +} + /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that @@ -1005,9 +1026,10 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, } /** - * try_to_grab_pending - steal work item from worklist + * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @is_dwork: @work is a delayed_work + * @flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. Return values are @@ -1015,13 +1037,30 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry + * -ENOENT if someone else is canceling @work, this state may persist + * for arbitrarily long * - * On >= 0 return, the caller owns @work's PENDING bit. + * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting + * preempted while holding PENDING and @work off queue, preemption must be + * disabled on entry. This ensures that we don't return -EAGAIN while + * another task is preempted in this function. + * + * On successful return, >= 0, irq is disabled and the caller is + * responsible for releasing it using local_irq_restore(*@flags). + * + * This function is safe to call from any context other than IRQ handler. + * An IRQ handler may run on top of delayed_work_timer_fn() which can make + * this function return -EAGAIN perpetually. */ -static int try_to_grab_pending(struct work_struct *work, bool is_dwork) +static int try_to_grab_pending(struct work_struct *work, bool is_dwork, + unsigned long *flags) { struct global_cwq *gcwq; + WARN_ON_ONCE(in_irq()); + + local_irq_save(*flags); + /* try to steal the timer if it exists */ if (is_dwork) { struct delayed_work *dwork = to_delayed_work(work); @@ -1040,9 +1079,9 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork) */ gcwq = get_work_gcwq(work); if (!gcwq) - return -EAGAIN; + goto fail; - spin_lock_irq(&gcwq->lock); + spin_lock(&gcwq->lock); if (!list_empty(&work->entry)) { /* * This work is queued, but perhaps we locked the wrong gcwq. @@ -1057,12 +1096,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork) get_work_color(work), *work_data_bits(work) & WORK_STRUCT_DELAYED); - spin_unlock_irq(&gcwq->lock); + spin_unlock(&gcwq->lock); return 1; } } - spin_unlock_irq(&gcwq->lock); - + spin_unlock(&gcwq->lock); +fail: + local_irq_restore(*flags); + if (work_is_canceling(work)) + return -ENOENT; + cpu_relax(); return -EAGAIN; } @@ -2839,13 +2882,24 @@ EXPORT_SYMBOL_GPL(flush_work_sync); static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + unsigned long flags; int ret; do { - ret = try_to_grab_pending(work, is_dwork); - wait_on_work(work); + ret = try_to_grab_pending(work, is_dwork, &flags); + /* + * If someone else is canceling, wait for the same event it + * would be waiting for before retrying. + */ + if (unlikely(ret == -ENOENT)) + wait_on_work(work); } while (unlikely(ret < 0)); + /* tell other tasks trying to grab @work to back off */ + mark_work_canceling(work); + local_irq_restore(flags); + + wait_on_work(work); clear_work_data(work); return ret; } -- cgit v1.2.2 From 8376fe22c7e79c7e90857d39f82aeae6cad6c4b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:47 -0700 Subject: workqueue: implement mod_delayed_work[_on]() Workqueue was lacking a mechanism to modify the timeout of an already pending delayed_work. delayed_work users have been working around this using several methods - using an explicit timer + work item, messing directly with delayed_work->timer, and canceling before re-queueing, all of which are error-prone and/or ugly. This patch implements mod_delayed_work[_on]() which behaves similarly to mod_timer() - if the delayed_work is idle, it's queued with the given delay; otherwise, its timeout is modified to the new value. Zero @delay guarantees immediate execution. v2: Updated to reflect try_to_grab_pending() changes. Now safe to be called from bh context. Signed-off-by: Tejun Heo Cc: Linus Torvalds Cc: Andrew Morton Cc: Ingo Molnar --- kernel/workqueue.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b4a4e05c89e1..41ae2c0979fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1413,6 +1413,59 @@ bool queue_delayed_work(struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work); +/** + * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is + * zero, @work is guaranteed to be scheduled immediately regardless of its + * current state. + * + * Returns %false if @dwork was idle and queued, %true if @dwork was + * pending and its timer was modified. + * + * This function is safe to call from any context other than IRQ handler. + * See try_to_grab_pending() for details. + */ +bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (likely(ret >= 0)) { + __queue_delayed_work(cpu, wq, dwork, delay); + local_irq_restore(flags); + } + + /* -ENOENT from try_to_grab_pending() becomes %true */ + return ret; +} +EXPORT_SYMBOL_GPL(mod_delayed_work_on); + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(mod_delayed_work); + /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state -- cgit v1.2.2 From a181dc14ed23f7a499542ff4c78532b5f24bb18f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:29 +0300 Subject: jump_label: Export jump_label_rate_limit() CC: Jason Baron CC: Ingo Molnar CC: Peter Zijlstra Signed-off-by: Gleb Natapov Acked-by: Jason Baron Signed-off-by: Avi Kivity --- kernel/jump_label.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5ec..60f48fa0fd0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key, key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } +EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { -- cgit v1.2.2 From 3c18c10bde65b6dcaffab7a4d040285e4defa49b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 31 Jul 2012 10:23:37 -0400 Subject: tracing: Fix wakeup_rt self test on virtual machines The warkeup_rt self test used msleep() calls to wait for real time tasks to wake up and run. On bare-metal hardware, this was enough as the scheduler should let the RT task run way before the non-RT task wakes up from the msleep(). If it did not, then that would mean the scheduler was broken. But when dealing with virtual machines, this is a different story. If the RT task wakes up on a VCPU, it's up to the host to decide when that task gets to schedule, which can be far behind the time that the non-RT task wakes up. In this case, the test would fail incorrectly. As we are not testing the scheduler, but instead the wake up tracing, we can use completions to wait and not depend on scheduler timings to see if events happen on time. Link: http://lkml.kernel.org/r/1343663105.3847.7.camel@fedora Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 1003a4d5eb25..2c00a691a540 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1041,6 +1041,8 @@ static int trace_wakeup_test_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); schedule(); + complete(x); + /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* @@ -1084,24 +1086,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tracing_max_latency = 0; - /* sleep to let the RT thread sleep too */ - msleep(100); + while (p->on_rq) { + /* + * Sleep to make sure the RT thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } - /* - * Yes this is slightly racy. It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. - */ + init_completion(&isrt); wake_up_process(p); - /* give a little time to let the thread wake up */ - msleep(100); + /* Wait for the task to wake up */ + wait_for_completion(&isrt); /* stop the tracing. */ tracing_stop(); -- cgit v1.2.2 From 92d8d4a8b0f4c6eba70f6e62b48e38bd005a56e6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 19 Jun 2012 17:47:52 +0200 Subject: tracing/filter: Add missing initialization Add missing initialization for ret variable. Its initialization is based on the re_cnt variable, which is being set deep down in the ftrace_function_filter_re function. I'm not sure compilers would be smart enough to see this in near future, so killing the warning this way. Link: http://lkml.kernel.org/r/1340120894-9465-2-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 431dba8b7542..c154797a7ff7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, static int __ftrace_function_set_filter(int filter, char *buf, int len, struct function_filter_data *data) { - int i, re_cnt, ret; + int i, re_cnt, ret = -EINVAL; int *reset; char **re; -- cgit v1.2.2 From 87abb3b15c62033409f5bf2ffb5620c94f91cf2c Mon Sep 17 00:00:00 2001 From: Wang Tianhong Date: Thu, 2 Aug 2012 14:02:00 +0800 Subject: tracing/trivial: Fix some typos in kernel/trace Fix some typos in kernel/trace. Link: http://lkml.kernel.org/r/1343887320.2228.9.camel@louis-ThinkPad-T410 Signed-off-by: Wang Tianhong Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- kernel/trace/trace.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 49491fa7daa2..b32ed0e385a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as - * it works like an on/off switch, where as the disable() verison + * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct ring_buffer *buffer) @@ -2839,7 +2839,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as - * it works like an on/off switch, where as the enable() verison + * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ void ring_buffer_record_on(struct ring_buffer *buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a120f98c4112..d1a8d07ec866 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -426,15 +426,15 @@ __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { - unsigned long threshhold; + unsigned long threshold; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &threshhold); + ret = strict_strtoul(str, 0, &threshold); if (ret < 0) return 0; - tracing_thresh = threshhold * 1000; + tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); -- cgit v1.2.2 From 4018994f3d8785275ef0e7391b75c3462c029e56 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:37 +0200 Subject: perf: Add ability to attach user level registers dump to sample Introducing PERF_SAMPLE_REGS_USER sample type bit to trigger the dump of user level registers on sample. Registers we want to dump are specified by sample_regs_user bitmask. Only user level registers are dumped at the moment. Meaning the register values of the user space context as it was before the user entered the kernel for whatever reason (syscall, irq, exception, or a PMI happening in userspace). The layout of the sample_regs_user bitmap is described in asm/perf_regs.h for archs that support register dump. This is going to be useful to bring Dwarf CFI based stack unwinding on top of samples. Original-patch-by: Frederic Weisbecker [ Dump registers ABI specification. ] Signed-off-by: Jiri Olsa Suggested-by: Stephane Eranian Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d9..d3ce97525b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3756,6 +3756,37 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +static void +perf_output_sample_regs(struct perf_output_handle *handle, + struct pt_regs *regs, u64 mask) +{ + int bit; + + for_each_set_bit(bit, (const unsigned long *) &mask, + sizeof(mask) * BITS_PER_BYTE) { + u64 val; + + val = perf_reg_value(regs, bit); + perf_output_put(handle, val); + } +} + +static void perf_sample_regs_user(struct perf_regs_user *regs_user, + struct pt_regs *regs) +{ + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + regs_user->regs = regs; + regs_user->abi = perf_reg_abi(current); + } +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4016,6 +4047,23 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, nr); } } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + u64 abi = data->regs_user.abi; + + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_user; + perf_output_sample_regs(handle, + data->regs_user.regs, + mask); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -4067,6 +4115,20 @@ void perf_prepare_sample(struct perf_event_header *header, } header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_user(&data->regs_user, regs); + + if (data->regs_user.regs) { + u64 mask = event->attr.sample_regs_user; + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -6142,6 +6204,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } } + + if (attr->sample_type & PERF_SAMPLE_REGS_USER) + ret = perf_reg_validate(attr->sample_regs_user); + out: return ret; -- cgit v1.2.2 From 91d7753a45f8525dc75b6be01e427dc1c378dc16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Aug 2012 15:20:38 +0200 Subject: perf: Factor __output_copy to be usable with specific copy function Adding a generic way to use __output_copy function with specific copy function via DEFINE_PERF_OUTPUT_COPY macro. Using this to add new __output_copy_user function, that provides output copy from user pointers. For x86 the copy_from_user_nmi function is used and __copy_from_user_inatomic for the rest of the architectures. This new function will be used in user stack dump on sample, coming in next patches. Signed-off-by: Jiri Olsa Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-4-git-send-email-jolsa@redhat.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/internal.h | 62 ++++++++++++++++++++++++++++++--------------- kernel/events/ring_buffer.c | 4 +-- 2 files changed, 43 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index a096c19f2c2a..7fd5408493d2 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -2,6 +2,7 @@ #define _KERNEL_EVENTS_INTERNAL_H #include +#include /* Buffer handling */ @@ -76,30 +77,49 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } -static inline void -__output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned int \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned int len) \ +{ \ + unsigned long size, written; \ + \ + do { \ + size = min_t(unsigned long, handle->size, len); \ + \ + written = memcpy_func(handle->addr, buf, size); \ + \ + len -= written; \ + handle->addr += written; \ + buf += written; \ + handle->size -= written; \ + if (!handle->size) { \ + struct ring_buffer *rb = handle->rb; \ + \ + handle->page++; \ + handle->page &= rb->nr_pages - 1; \ + handle->addr = rb->data_pages[handle->page]; \ + handle->size = PAGE_SIZE << page_order(rb); \ + } \ + } while (len && written == size); \ + \ + return len; \ +} + +static inline int memcpy_common(void *dst, const void *src, size_t n) { - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct ring_buffer *rb = handle->rb; - - handle->page++; - handle->page &= rb->nr_pages - 1; - handle->addr = rb->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(rb); - } - } while (len); + memcpy(dst, src, n); + return n; } +DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) + +#ifndef arch_perf_out_copy_user +#define arch_perf_out_copy_user __copy_from_user_inatomic +#endif + +DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) + /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ddaba43fb7a..b4c2ad3dee7a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -182,10 +182,10 @@ out: return -ENOSPC; } -void perf_output_copy(struct perf_output_handle *handle, +unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - __output_copy(handle, buf, len); + return __output_copy(handle, buf, len); } void perf_output_end(struct perf_output_handle *handle) -- cgit v1.2.2 From 5685e0ff45f5df67e79e9b052b6ffd501ff38c11 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:39 +0200 Subject: perf: Add perf_output_skip function to skip bytes in sample Introducing perf_output_skip function to be able to skip data within the perf ring buffer. When writing data into perf ring buffer we first reserve needed place in ring buffer and then copy the actual data. There's a possibility we won't be able to fill all the reserved size with data, so we need a way to skip the remaining bytes. This is going to be useful when storing the user stack dump, where we might end up with less data than we originally requested. Signed-off-by: Jiri Olsa Acked-by: Frederic Weisbecker Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-5-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/internal.h | 4 ++++ kernel/events/ring_buffer.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 7fd5408493d2..ce7bdfc1d045 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -114,6 +114,10 @@ static inline int memcpy_common(void *dst, const void *src, size_t n) DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) +#define MEMCPY_SKIP(dst, src, n) (n) + +DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) + #ifndef arch_perf_out_copy_user #define arch_perf_out_copy_user __copy_from_user_inatomic #endif diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b4c2ad3dee7a..23cb34ff3973 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -188,6 +188,12 @@ unsigned int perf_output_copy(struct perf_output_handle *handle, return __output_copy(handle, buf, len); } +unsigned int perf_output_skip(struct perf_output_handle *handle, + unsigned int len) +{ + return __output_skip(handle, NULL, len); +} + void perf_output_end(struct perf_output_handle *handle) { perf_output_put_handle(handle); -- cgit v1.2.2 From c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:40 +0200 Subject: perf: Add ability to attach user stack dump to sample Introducing PERF_SAMPLE_STACK_USER sample type bit to trigger the dump of the user level stack on sample. The size of the dump is specified by sample_stack_user value. Being able to dump parts of the user stack, starting from the stack pointer, will be useful to make a post mortem dwarf CFI based stack unwinding. Added HAVE_PERF_USER_STACK_DUMP config option to determine if the architecture provides user stack dump on perf event samples. This needs access to the user stack pointer which is not unified across architectures. Enabling this for x86 architecture. Signed-off-by: Jiri Olsa Original-patch-by: Frederic Weisbecker Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-6-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/events/internal.h | 16 +++++ 2 files changed, 165 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d3ce97525b9f..2ba890450d15 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" @@ -3787,6 +3788,101 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user, } } +/* + * Get remaining task size from user stack pointer. + * + * It'd be better to take stack vma map and limit this more + * precisly, but there's no way to get it safely under interrupt, + * so using TASK_SIZE as limit. + */ +static u64 perf_ustack_task_size(struct pt_regs *regs) +{ + unsigned long addr = perf_user_stack_pointer(regs); + + if (!addr || addr >= TASK_SIZE) + return 0; + + return TASK_SIZE - addr; +} + +static u16 +perf_sample_ustack_size(u16 stack_size, u16 header_size, + struct pt_regs *regs) +{ + u64 task_size; + + /* No regs, no stack pointer, no dump. */ + if (!regs) + return 0; + + /* + * Check if we fit in with the requested stack size into the: + * - TASK_SIZE + * If we don't, we limit the size to the TASK_SIZE. + * + * - remaining sample size + * If we don't, we customize the stack size to + * fit in to the remaining sample size. + */ + + task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); + stack_size = min(stack_size, (u16) task_size); + + /* Current header size plus static size and dynamic size. */ + header_size += 2 * sizeof(u64); + + /* Do we fit in with the current stack dump size? */ + if ((u16) (header_size + stack_size) < header_size) { + /* + * If we overflow the maximum size for the sample, + * we customize the stack dump size to fit in. + */ + stack_size = USHRT_MAX - header_size - sizeof(u64); + stack_size = round_up(stack_size, sizeof(u64)); + } + + return stack_size; +} + +static void +perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, + struct pt_regs *regs) +{ + /* Case of a kernel thread, nothing to dump */ + if (!regs) { + u64 size = 0; + perf_output_put(handle, size); + } else { + unsigned long sp; + unsigned int rem; + u64 dyn_size; + + /* + * We dump: + * static size + * - the size requested by user or the best one we can fit + * in to the sample max size + * data + * - user stack dump data + * dynamic size + * - the actual dumped size + */ + + /* Static size. */ + perf_output_put(handle, dump_size); + + /* Data. */ + sp = perf_user_stack_pointer(regs); + rem = __output_copy_user(handle, (void *) sp, dump_size); + dyn_size = dump_size - rem; + + perf_output_skip(handle, rem); + + /* Dynamic size. */ + perf_output_put(handle, dyn_size); + } +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4064,6 +4160,11 @@ void perf_output_sample(struct perf_output_handle *handle, mask); } } + + if (sample_type & PERF_SAMPLE_STACK_USER) + perf_output_sample_ustack(handle, + data->stack_user_size, + data->regs_user.regs); } void perf_prepare_sample(struct perf_event_header *header, @@ -4129,6 +4230,35 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_STACK_USER) { + /* + * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * processed as the last one or have additional check added + * in case new sample type is added, because we could eat + * up the rest of the sample size. + */ + struct perf_regs_user *uregs = &data->regs_user; + u16 stack_size = event->attr.sample_stack_user; + u16 size = sizeof(u64); + + if (!uregs->abi) + perf_sample_regs_user(uregs, regs); + + stack_size = perf_sample_ustack_size(stack_size, header->size, + uregs->regs); + + /* + * If there is something to dump, add space for the dump + * itself and for the field that tells the dynamic size, + * which is how many have been actually dumped. + */ + if (stack_size) + size += sizeof(u64) + stack_size; + + data->stack_user_size = stack_size; + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -6205,8 +6335,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } } - if (attr->sample_type & PERF_SAMPLE_REGS_USER) + if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); + if (ret) + return ret; + } + + if (attr->sample_type & PERF_SAMPLE_STACK_USER) { + if (!arch_perf_have_user_stack_dump()) + return -ENOSYS; + + /* + * We have __u32 type for the size, but so far + * we can only use __u16 as maximum due to the + * __u16 sample size limit. + */ + if (attr->sample_stack_user >= USHRT_MAX) + ret = -EINVAL; + else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) + ret = -EINVAL; + } out: return ret; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ce7bdfc1d045..d56a64c99a8b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -158,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx) recursion[rctx]--; } +#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP +static inline bool arch_perf_have_user_stack_dump(void) +{ + return true; +} + +#define perf_user_stack_pointer(regs) user_stack_pointer(regs) +#else +static inline bool arch_perf_have_user_stack_dump(void) +{ + return false; +} + +#define perf_user_stack_pointer(regs) 0 +#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ + #endif /* _KERNEL_EVENTS_INTERNAL_H */ -- cgit v1.2.2 From d077526485d5c9b12fe85d0b2b3b7041e6bc5f91 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Aug 2012 15:20:41 +0200 Subject: perf: Add attribute to filter out callchains Introducing following bits to the the perf_event_attr struct: - exclude_callchain_kernel to filter out kernel callchain from the sample dump - exclude_callchain_user to filter out user callchain from the sample dump We need to be able to disable standard user callchain dump when we use the dwarf cfi callchain mode, because frame pointer based user callchains are useless in this mode. Implementing also exclude_callchain_kernel to have complete set of options. Signed-off-by: Jiri Olsa [ Added kernel callchains filtering ] Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-7-git-send-email-jolsa@redhat.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 98d4597f43d6..c77206184b8b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) int rctx; struct perf_callchain_entry *entry; + int kernel = !event->attr.exclude_callchain_kernel; + int user = !event->attr.exclude_callchain_user; + + if (!kernel && !user) + return NULL; entry = get_callchain_entry(&rctx); if (rctx == -1) @@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) entry->nr = 0; - if (!user_mode(regs)) { + if (kernel && !user_mode(regs)) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; } - if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) - goto exit_put; - - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + if (user) { + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } } exit_put: -- cgit v1.2.2 From 5d01bbd111d6ff9ea9d9847774f66dff39633776 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 10:42:35 +0000 Subject: rcu: Yield simpler The rcu_yield() code is amazing. It's there to avoid starvation of the system when lots of (boosting) work is to be done. Now looking at the code it's functionality is: Make the thread SCHED_OTHER and very nice, i.e. get it out of the way Arm a timer with 2 ticks schedule() Now if the system goes idle the rcu task returns, regains SCHED_FIFO and plugs on. If the systems stays busy the timer fires and wakes a per node kthread which in turn makes the per cpu thread SCHED_FIFO and brings it back on the cpu. For the boosting thread the "make it FIFO" bit is missing and it just runs some magic boost checks. Now this is a lot of code with extra threads and complexity. It's way simpler to let the tasks when they detect overload schedule away for 2 ticks and defer the normal wakeup as long as they are in yielded state and the cpu is not idle. That solves the same problem and the only difference is that when the cpu goes idle it's not guaranteed that the thread returns right away, but it won't be longer out than two ticks, so no harm is done. If that's an issue than it is way simpler just to wake the task from idle as RCU has callbacks there anyway. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Srivatsa S. Bhat Cc: Rusty Russell Cc: Namhyung Kim Reviewed-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20120716103948.131256723@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/rcutree.c | 8 +- kernel/rcutree.h | 7 +- kernel/rcutree_plugin.h | 210 +++++++++--------------------------------------- 3 files changed, 41 insertions(+), 184 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..f08ee3bc5741 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -139,7 +139,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -1469,7 +1469,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -2594,11 +2594,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); rcu_cpu_kthread_setrt(cpu, 1); break; case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); + rcu_boost_kthread_setaffinity(rnp, cpu); rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f2124..f08176172546 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -491,13 +491,8 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index); -static void invoke_rcu_node_kthread(struct rcu_node *rnp); -static void rcu_yield(void (*f)(unsigned long), unsigned long arg); + struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df01..0f8b5ec64a7d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1069,6 +1069,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1140,17 +1150,6 @@ static int rcu_boost(struct rcu_node *rnp) ACCESS_ONCE(rnp->boost_tasks) != NULL; } -/* - * Timer handler to initiate waking up of boost kthreads that - * have yielded the CPU due to excessive numbers of tasks to - * boost. We wake up the per-rcu_node kthread, which in turn - * will wake up the booster kthread. - */ -static void rcu_boost_kthread_timer(unsigned long arg) -{ - invoke_rcu_node_kthread((struct rcu_node *)arg); -} - /* * Priority-boosting kthread. One per leaf rcu_node and one for the * root rcu_node. @@ -1174,8 +1173,9 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End boost kthread@rcu_yield"); - rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } @@ -1213,8 +1213,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore(&rnp->lock, flags); t = rnp->boost_kthread_task; - if (t != NULL) - wake_up_process(t); + if (t) + rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1231,8 +1231,10 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } local_irq_restore(flags); } @@ -1245,21 +1247,6 @@ static bool rcu_is_callbacks_kthread(void) return __get_cpu_var(rcu_cpu_kthread_task) == current; } -/* - * Set the affinity of the boost kthread. The CPU-hotplug locks are - * held, so no one should be messing with the existence of the boost - * kthread. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) -{ - struct task_struct *t; - - t = rnp->boost_kthread_task; - if (t != NULL) - set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1276,15 +1263,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) + struct rcu_node *rnp) { + int rnp_index = rnp - &rsp->node[0]; unsigned long flags; struct sched_param sp; struct task_struct *t; if (&rcu_preempt_state != rsp) return 0; + + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; @@ -1327,20 +1318,6 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) -{ - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - /* * Set the specified CPU's kthread to run RT or not, as specified by * the to_rt argument. The CPU-hotplug locks are held, so the task @@ -1365,45 +1342,6 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt) sched_setscheduler_nocheck(t, policy, &sp); } -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); -} - -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) -{ - struct sched_param sp; - struct timer_list yield_timer; - int prio = current->rt_priority; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - set_user_nice(current, 0); - sp.sched_priority = prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); -} - /* * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. * This can happen while the corresponding CPU is either coming online @@ -1476,7 +1414,7 @@ static int rcu_cpu_kthread(void *arg) if (spincnt > 10) { *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End CPU kthread@rcu_yield"); - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start CPU kthread@rcu_yield"); spincnt = 0; } @@ -1532,48 +1470,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) return 0; } -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) -{ - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); - } - } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; - return 0; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still @@ -1583,17 +1479,17 @@ static int rcu_node_kthread(void *arg) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { + struct task_struct *t = rnp->boost_kthread_task; + unsigned long mask = rnp->qsmaskinit; cpumask_var_t cm; int cpu; - unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL) + if (!t) return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - cpumask_clear(cm); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); @@ -1603,50 +1499,17 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(cpu, cm); WARN_ON_ONCE(cpumask_weight(cm) == 0); } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); + set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); -} - /* * Spawn all kthreads -- called as soon as the scheduler is running. */ static int __init rcu_spawn_kthreads(void) { - int cpu; struct rcu_node *rnp; + int cpu; rcu_scheduler_fully_active = 1; for_each_possible_cpu(cpu) { @@ -1655,10 +1518,10 @@ static int __init rcu_spawn_kthreads(void) (void)rcu_spawn_one_cpu_kthread(cpu); } rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } return 0; } @@ -1672,8 +1535,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) { (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } } @@ -1706,7 +1568,7 @@ static void rcu_stop_cpu_kthread(int cpu) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -- cgit v1.2.2 From 2a1d446019f9a5983ec5a335b95e8593fdb6fa2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 10:42:36 +0000 Subject: kthread: Implement park/unpark facility To avoid the full teardown/setup of per cpu kthreads in the case of cpu hot(un)plug, provide a facility which allows to put the kthread into a park position and unpark it when the cpu comes online again. Signed-off-by: Thomas Gleixner Reviewed-by: Namhyung Kim Cc: Peter Zijlstra Reviewed-by: Srivatsa S. Bhat Cc: Rusty Russell Reviewed-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20120716103948.236618824@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 166 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index b579af57ea10..146a6fa96825 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -37,11 +37,20 @@ struct kthread_create_info }; struct kthread { - int should_stop; + unsigned long flags; + unsigned int cpu; void *data; + struct completion parked; struct completion exited; }; +enum KTHREAD_BITS { + KTHREAD_IS_PER_CPU = 0, + KTHREAD_SHOULD_STOP, + KTHREAD_SHOULD_PARK, + KTHREAD_IS_PARKED, +}; + #define to_kthread(tsk) \ container_of((tsk)->vfork_done, struct kthread, exited) @@ -52,12 +61,28 @@ struct kthread { * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ -int kthread_should_stop(void) +bool kthread_should_stop(void) { - return to_kthread(current)->should_stop; + return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_should_park - should this kthread park now? + * + * When someone calls kthread_park() on your kthread, it will be woken + * and this will return true. You should then do the necessary + * cleanup and call kthread_parkme() + * + * Similar to kthread_should_stop(), but this keeps the thread alive + * and in a park position. kthread_unpark() "restarts" the thread and + * calls the thread function again. + */ +bool kthread_should_park(void) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); +} + /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen @@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +static void __kthread_parkme(struct kthread *self) +{ + __set_current_state(TASK_INTERRUPTIBLE); + while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { + if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) + complete(&self->parked); + schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + } + clear_bit(KTHREAD_IS_PARKED, &self->flags); + __set_current_state(TASK_RUNNING); +} + +void kthread_parkme(void) +{ + __kthread_parkme(to_kthread(current)); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -105,9 +148,10 @@ static int kthread(void *_create) struct kthread self; int ret; - self.should_stop = 0; + self.flags = 0; self.data = data; init_completion(&self.exited); + init_completion(&self.parked); current->vfork_done = &self.exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -117,9 +161,11 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - if (!self.should_stop) - ret = threadfn(data); + if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { + __kthread_parkme(&self); + ret = threadfn(data); + } /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create) * Returns a task_struct or ERR_PTR(-ENOMEM). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, - int node, + void *data, int node, const char namefmt[], ...) { @@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); +static void __kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); + p->flags |= PF_THREAD_BOUND; +} + /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). @@ -226,13 +278,111 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) WARN_ON(1); return; } - - /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + __kthread_bind(p, cpu); } EXPORT_SYMBOL(kthread_bind); +/** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: This helper function creates and names a kernel thread + * The thread will be woken and put into park mode. + */ +struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + void *data, unsigned int cpu, + const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + cpu); + if (IS_ERR(p)) + return p; + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; + /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ + kthread_park(p); + return p; +} + +static struct kthread *task_get_live_kthread(struct task_struct *k) +{ + struct kthread *kthread; + + get_task_struct(k); + kthread = to_kthread(k); + /* It might have exited */ + barrier(); + if (k->vfork_done != NULL) + return kthread; + return NULL; +} + +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + + if (kthread) { + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu); + wake_up_process(k); + } + } + put_task_struct(k); +} + +/** + * kthread_park - park a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return true, wakes it, and + * waits for it to return. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will park without + * calling threadfn(). + * + * Returns 0 if the thread is parked, -ENOSYS if the thread exited. + * If called by the kthread itself just the park bit is set. + */ +int kthread_park(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + int ret = -ENOSYS; + + if (kthread) { + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); + } + } + ret = 0; + } + put_task_struct(k); + return ret; +} + /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). @@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread; + struct kthread *kthread = task_get_live_kthread(k); int ret; trace_sched_kthread_stop(k); - get_task_struct(k); - - kthread = to_kthread(k); - barrier(); /* it might have exited */ - if (k->vfork_done != NULL) { - kthread->should_stop = 1; + if (kthread) { + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_process(k); wait_for_completion(&kthread->exited); } -- cgit v1.2.2 From f97f8f06a49febbc3cb3635172efbe64ddc79700 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 10:42:36 +0000 Subject: smpboot: Provide infrastructure for percpu hotplug threads Provide a generic interface for setting up and tearing down percpu threads. On registration the threads for already online cpus are created and started. On deregistration (modules) the threads are stoppped. During hotplug operations the threads are created, started, parked and unparked. The datastructure for registration provides a pointer to percpu storage space and optional setup, cleanup, park, unpark functions. These functions are called when the thread state changes. Each implementation has to provide a function which is queried and returns whether the thread should run and the thread function itself. The core code handles all state transitions and avoids duplicated code in the call sites. [ paulmck: Preemption leak fix ] Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Srivatsa S. Bhat Cc: Rusty Russell Reviewed-by: Paul E. McKenney Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20120716103948.352501068@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 10 ++- kernel/smpboot.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/smpboot.h | 4 + 3 files changed, 242 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 14d32588cccd..e615dfbcf794 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } + smpboot_park_threads(cpu); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ + smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_release; } BUG_ON(cpu_online(cpu)); @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + ret = smpboot_create_threads(cpu); + if (ret) + goto out; + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; @@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + /* Wake the per cpu threads */ + smpboot_unpark_threads(cpu); + /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1b..9d5f7b04025d 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,11 +1,17 @@ /* * Common SMP CPU bringup/teardown functions */ +#include #include #include #include +#include +#include #include +#include #include +#include +#include #include "smpboot.h" @@ -65,3 +71,226 @@ void __init idle_threads_init(void) } } #endif + +static LIST_HEAD(hotplug_threads); +static DEFINE_MUTEX(smpboot_threads_lock); + +struct smpboot_thread_data { + unsigned int cpu; + unsigned int status; + struct smp_hotplug_thread *ht; +}; + +enum { + HP_THREAD_NONE = 0, + HP_THREAD_ACTIVE, + HP_THREAD_PARKED, +}; + +/** + * smpboot_thread_fn - percpu hotplug thread loop function + * @data: thread data pointer + * + * Checks for thread stop and park conditions. Calls the necessary + * setup, cleanup, park and unpark functions for the registered + * thread. + * + * Returns 1 when the thread should exit, 0 otherwise. + */ +static int smpboot_thread_fn(void *data) +{ + struct smpboot_thread_data *td = data; + struct smp_hotplug_thread *ht = td->ht; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + preempt_disable(); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->cleanup) + ht->cleanup(td->cpu, cpu_online(td->cpu)); + kfree(td); + return 0; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->park && td->status == HP_THREAD_ACTIVE) { + BUG_ON(td->cpu != smp_processor_id()); + ht->park(td->cpu); + td->status = HP_THREAD_PARKED; + } + kthread_parkme(); + /* We might have been woken for stop */ + continue; + } + + BUG_ON(td->cpu != smp_processor_id()); + + /* Check for state change setup */ + switch (td->status) { + case HP_THREAD_NONE: + preempt_enable(); + if (ht->setup) + ht->setup(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + case HP_THREAD_PARKED: + preempt_enable(); + if (ht->unpark) + ht->unpark(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + } + + if (!ht->thread_should_run(td->cpu)) { + preempt_enable(); + schedule(); + } else { + set_current_state(TASK_RUNNING); + preempt_enable(); + ht->thread_fn(td->cpu); + } + } +} + +static int +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + struct smpboot_thread_data *td; + + if (tsk) + return 0; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); + if (!td) + return -ENOMEM; + td->cpu = cpu; + td->ht = ht; + + tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, + ht->thread_comm); + if (IS_ERR(tsk)) { + kfree(td); + return PTR_ERR(tsk); + } + + get_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = tsk; + return 0; +} + +int smpboot_create_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) { + ret = __smpboot_create_thread(cur, cpu); + if (ret) + break; + } + mutex_unlock(&smpboot_threads_lock); + return ret; +} + +static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + kthread_unpark(tsk); +} + +void smpboot_unpark_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) + smpboot_unpark_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) + kthread_park(tsk); +} + +void smpboot_park_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry_reverse(cur, &hotplug_threads, list) + smpboot_park_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) +{ + unsigned int cpu; + + /* We need to destroy also the parked threads of offline cpus */ + for_each_possible_cpu(cpu) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) { + kthread_stop(tsk); + put_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = NULL; + } + } +} + +/** + * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Creates and starts the threads on all online cpus. + */ +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + unsigned int cpu; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + for_each_online_cpu(cpu) { + ret = __smpboot_create_thread(plug_thread, cpu); + if (ret) { + smpboot_destroy_threads(plug_thread); + goto out; + } + smpboot_unpark_thread(plug_thread, cpu); + } + list_add(&plug_thread->list, &hotplug_threads); +out: + mutex_unlock(&smpboot_threads_lock); + return ret; +} +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); + +/** + * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Stops all threads on all possible cpus. + */ +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + list_del(&plug_thread->list); + smpboot_destroy_threads(plug_thread); + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6ef9433e1c70..72415a0eb955 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { } static inline void idle_threads_init(void) { } #endif +int smpboot_create_threads(unsigned int cpu); +void smpboot_park_threads(unsigned int cpu); +void smpboot_unpark_threads(unsigned int cpu); + #endif -- cgit v1.2.2 From 3180d89b47701072cf129f800a735baf3acdbb8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jul 2012 01:55:54 -0700 Subject: hotplug: Fix UP bug in smpboot hotplug code Because kernel subsystems need their per-CPU kthreads on UP systems as well as on SMP systems, the smpboot hotplug kthread functions must be provided in UP builds as well as in SMP builds. This commit therefore adds smpboot.c to UP builds and excludes irrelevant code via #ifdef. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner --- kernel/Makefile | 3 +-- kernel/smpboot.c | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764c..e5602d32acb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o lglock.o + async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 9d5f7b04025d..d6c5fc054242 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -15,6 +15,8 @@ #include "smpboot.h" +#ifdef CONFIG_SMP + #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -72,6 +74,8 @@ void __init idle_threads_init(void) } #endif +#endif /* #ifdef CONFIG_SMP */ + static LIST_HEAD(hotplug_threads); static DEFINE_MUTEX(smpboot_threads_lock); -- cgit v1.2.2 From 3e339b5dae24a7065e196eb8d0145ab2f8cc2d2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 10:42:37 +0000 Subject: softirq: Use hotplug thread infrastructure [ paulmck: Call rcu_note_context_switch() with interrupts enabled. ] Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Srivatsa S. Bhat Cc: Rusty Russell Reviewed-by: Paul E. McKenney Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20120716103948.456416747@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 111 ++++++++++++++----------------------------------------- 1 file changed, 27 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09e..5c6a5bd8462f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -742,49 +743,22 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int run_ksoftirqd(void * __bind_cpu) +static int ksoftirqd_should_run(unsigned int cpu) { - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - preempt_disable(); - if (!local_softirq_pending()) { - schedule_preempt_disabled(); - } - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - sched_preempt_enable_no_resched(); - cond_resched(); - preempt_disable(); - rcu_note_context_switch((long)__bind_cpu); - } - preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; + return local_softirq_pending(); +} -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (local_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; } - __set_current_state(TASK_RUNNING); - return 0; + local_irq_enable(); } #ifdef CONFIG_HOTPLUG_CPU @@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create_on_node(run_ksoftirqd, - hcpu, - cpu_to_node(hotcpu), - "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return notifier_from_errno(PTR_ERR(p)); - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); case CPU_DEAD: - case CPU_DEAD_FROZEN: { - static const struct sched_param param = { - .sched_priority = MAX_RT_PRIO-1 - }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); - takeover_tasklets(hotcpu); + case CPU_DEAD_FROZEN: + takeover_tasklets((unsigned long)hcpu); break; - } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; +static struct smp_hotplug_thread softirq_threads = { + .store = &ksoftirqd, + .thread_should_run = ksoftirqd_should_run, + .thread_fn = run_ksoftirqd, + .thread_comm = "ksoftirqd/%u", +}; + static __init int spawn_ksoftirqd(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err != NOTIFY_OK); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); + + BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); + return 0; } early_initcall(spawn_ksoftirqd); -- cgit v1.2.2 From bcd951cf10f24e341defcd002c15a1f4eea13ddb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 10:42:38 +0000 Subject: watchdog: Use hotplug thread infrastructure Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Srivatsa S. Bhat Cc: Rusty Russell Reviewed-by: Paul E. McKenney Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20120716103948.563736676@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/watchdog.c | 263 ++++++++++++++++++------------------------------------ 1 file changed, 89 insertions(+), 174 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..9d4c8d5a1f53 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -29,16 +30,18 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_disabled; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif @@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event, __this_cpu_write(hard_watchdog_warn, false); return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -#else -static inline void watchdog_interrupt_count(void) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +static int watchdog_nmi_enable(unsigned int cpu); +static void watchdog_nmi_disable(unsigned int cpu); /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) @@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } +static void watchdog_set_prio(unsigned int policy, unsigned int prio) +{ + struct sched_param param = { .sched_priority = prio }; -/* - * The watchdog thread - touches the timestamp. - */ -static int watchdog(void *unused) + sched_setscheduler(current, policy, ¶m); +} + +static void watchdog_enable(unsigned int cpu) { - struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - /* initialize timestamp */ - __touch_watchdog(); + if (!watchdog_enabled) { + kthread_park(current); + return; + } + + /* Enable the perf event */ + watchdog_nmi_enable(cpu); /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), HRTIMER_MODE_REL_PINNED); - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly (kicked by the hrtimer callback function) once every - * get_sample_period() seconds (4 seconds by default) to reset the - * softlockup timestamp. If this gets delayed for more than - * 2*watchdog_thresh seconds then the debug-printout triggers in - * watchdog_timer_fn(). - */ - while (!kthread_should_stop()) { - __touch_watchdog(); - schedule(); + /* initialize timestamp */ + watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); + __touch_watchdog(); +} - if (kthread_should_stop()) - break; +static void watchdog_disable(unsigned int cpu) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - set_current_state(TASK_INTERRUPTIBLE); - } - /* - * Drop the policy/priority elevation during thread exit to avoid a - * scheduling latency spike. - */ - __set_current_state(TASK_RUNNING); - sched_setscheduler(current, SCHED_NORMAL, ¶m); - return 0; + watchdog_set_prio(SCHED_NORMAL, 0); + hrtimer_cancel(hrtimer); + /* disable the perf event */ + watchdog_nmi_disable(cpu); } +static int watchdog_should_run(unsigned int cpu) +{ + return __this_cpu_read(hrtimer_interrupts) != + __this_cpu_read(soft_lockup_hrtimer_cnt); +} + +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every get_sample_period() seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static void watchdog(unsigned int cpu) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); +} #ifdef CONFIG_HARDLOCKUP_DETECTOR /* @@ -379,7 +403,7 @@ static int watchdog(void *unused) */ static unsigned long cpu0_err; -static int watchdog_nmi_enable(int cpu) +static int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -433,7 +457,7 @@ out: return 0; } -static void watchdog_nmi_disable(int cpu) +static void watchdog_nmi_disable(unsigned int cpu) { struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu) return; } #else -static int watchdog_nmi_enable(int cpu) { return 0; } -static void watchdog_nmi_disable(int cpu) { return; } +static int watchdog_nmi_enable(unsigned int cpu) { return 0; } +static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static void watchdog_prepare_cpu(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - WARN_ON(per_cpu(softlockup_watchdog, cpu)); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; -} - -static int watchdog_enable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err = 0; - - /* enable the perf event */ - err = watchdog_nmi_enable(cpu); - - /* Regardless of err above, fall through and start softlockup */ - - /* create the watchdog thread */ - if (!p) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); - if (IS_ERR(p)) { - pr_err("softlockup watchdog for %i failed\n", cpu); - if (!err) { - /* if hardlockup hasn't already set this */ - err = PTR_ERR(p); - /* and disable the perf event */ - watchdog_nmi_disable(cpu); - } - goto out; - } - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - per_cpu(watchdog_touch_ts, cpu) = 0; - per_cpu(softlockup_watchdog, cpu) = p; - wake_up_process(p); - } - -out: - return err; -} - -static void watchdog_disable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - /* - * cancel the timer first to stop incrementing the stats - * and waking up the kthread - */ - hrtimer_cancel(hrtimer); - - /* disable the perf event */ - watchdog_nmi_disable(cpu); - - /* stop the watchdog thread */ - if (p) { - per_cpu(softlockup_watchdog, cpu) = NULL; - kthread_stop(p); - } -} - /* sysctl functions */ #ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { - int cpu; - - watchdog_enabled = 0; - - for_each_online_cpu(cpu) - if (!watchdog_enable(cpu)) - /* if any cpu succeeds, watchdog is considered - enabled for the system */ - watchdog_enabled = 1; - - if (!watchdog_enabled) - pr_err("failed to be enabled on some cpus\n"); + unsigned int cpu; + if (watchdog_disabled) { + watchdog_disabled = 0; + for_each_online_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } } static void watchdog_disable_all_cpus(void) { - int cpu; - - for_each_online_cpu(cpu) - watchdog_disable(cpu); + unsigned int cpu; - /* if all watchdogs are disabled, then they are disabled for the system */ - watchdog_enabled = 0; + if (!watchdog_disabled) { + watchdog_disabled = 1; + for_each_online_cpu(cpu) + kthread_park(per_cpu(softlockup_watchdog, cpu)); + } } - /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ @@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write, { int ret; + if (watchdog_disabled < 0) + return -ENODEV; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) - goto out; + return ret; if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); -out: return ret; } #endif /* CONFIG_SYSCTL */ - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - watchdog_prepare_cpu(hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (watchdog_enabled) - watchdog_enable(hotcpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - watchdog_disable(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - watchdog_disable(hotcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - - /* - * hardlockup and softlockup are not important enough - * to block cpu bring up. Just always succeed and - * rely on printk output to flag problems. - */ - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .park = watchdog_disable, + .unpark = watchdog_enable, }; void __init lockup_detector_init(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(notifier_to_errno(err)); - - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return; + if (smpboot_register_percpu_thread(&watchdog_threads)) { + pr_err("Failed to create watchdog threads, disabled\n"); + watchdog_disabled = -ENODEV; + } } -- cgit v1.2.2 From 62ab7072476ae1600e877cc62b43758e485f4f1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Jul 2012 10:42:38 +0000 Subject: rcu: Use smp_hotplug_thread facility for RCUs per-CPU kthread Bring RCU into the new-age CPU-hotplug fold by modifying RCU's per-CPU kthread code to use the new smp_hotplug_thread facility. [ tglx: Adapted it to use callbacks and to the simplified rcu yield ] Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Srivatsa S. Bhat Cc: Rusty Russell Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20120716103948.673354828@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/rcutree.c | 4 - kernel/rcutree.h | 8 -- kernel/rcutree_plugin.h | 203 ++++++++++-------------------------------------- kernel/rcutree_trace.c | 3 +- 4 files changed, 41 insertions(+), 177 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f08ee3bc5741..11a4fdca1df7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -133,7 +133,6 @@ static int rcu_scheduler_fully_active __read_mostly; */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); @@ -1468,7 +1467,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ - rcu_stop_cpu_kthread(cpu); rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -2595,11 +2593,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_ONLINE: case CPU_DOWN_FAILED: rcu_boost_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); break; case CPU_DOWN_PREPARE: rcu_boost_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: case CPU_DYING_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index f08176172546..1224d4c05382 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -196,12 +196,6 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ - struct task_struct *node_kthread_task; - /* kthread that takes care of this rcu_node */ - /* structure, for example, awakening the */ - /* per-CPU kthreads as needed. */ - unsigned int node_kthread_status; - /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; /* @@ -468,7 +462,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); -static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); @@ -494,7 +487,6 @@ static void rcu_preempt_do_callbacks(void); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0f8b5ec64a7d..c1961aed1213 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include +#include #define RCU_KTHREAD_PRIO 1 @@ -1292,25 +1293,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Stop the RCU's per-CPU kthread when its CPU goes offline,. - */ -static void rcu_stop_cpu_kthread(int cpu) -{ - struct task_struct *t; - - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1318,59 +1300,22 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +static void rcu_cpu_kthread_setup(unsigned int cpu) { - int policy; struct sched_param sp; - struct task_struct *t; - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) +static void rcu_cpu_kthread_park(unsigned int cpu) { - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __get_cpu_var(rcu_cpu_has_work); } /* @@ -1378,96 +1323,35 @@ static int rcu_cpu_kthread_should_stop(int cpu) * RCU softirq used in flavors and configurations of RCU that do not * support RCU priority boosting. */ -static int rcu_cpu_kthread(void *arg) +static void rcu_cpu_kthread(unsigned int cpu) { - int cpu = (int)(long)arg; - unsigned long flags; - int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + int spincnt; - trace_rcu_utilization("Start CPU kthread@init"); - for (;;) { - *statusp = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End CPU kthread@rcu_wait"); - rcu_wait(*workp != 0 || kthread_should_stop()); + for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; - local_irq_save(flags); + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); work = *workp; *workp = 0; - local_irq_restore(flags); + local_irq_enable(); if (work) rcu_kthread_do_work(); local_bh_enable(); - if (*workp != 0) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End CPU kthread@rcu_yield"); - schedule_timeout_interruptible(2); - trace_rcu_utilization("Start CPU kthread@rcu_yield"); - spincnt = 0; + if (*workp == 0) { + trace_rcu_utilization("End CPU kthread@rcu_wait"); + *statusp = RCU_KTHREAD_WAITING; + return; } } - *statusp = RCU_KTHREAD_STOPPED; - trace_rcu_utilization("End CPU kthread@term"); - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - * - * Please note that we cannot simply refuse to wake up the per-CPU - * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, - * which can result in softlockup complaints if the task ends up being - * idle for more than a couple of minutes. - * - * However, please note also that we cannot bind the per-CPU kthread to its - * CPU until that CPU is fully online. We also cannot wait until the - * CPU is fully online before we create its per-CPU kthread, as this would - * deadlock the system when CPU notifiers tried waiting for grace - * periods. So we bind the per-CPU kthread to its CPU only if the CPU - * is online. If its CPU is not yet fully online, then the code in - * rcu_cpu_kthread() will wait until it is fully online, and then do - * the binding. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create_on_node(rcu_cpu_kthread, - (void *)(long)cpu, - cpu_to_node(cpu), - "rcuc/%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - if (cpu_online(cpu)) - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ - return 0; + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("Start CPU kthread@rcu_yield"); + schedule_timeout_interruptible(2); + trace_rcu_utilization("End CPU kthread@rcu_yield"); + *statusp = RCU_KTHREAD_WAITING; } /* @@ -1503,6 +1387,15 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + /* * Spawn all kthreads -- called as soon as the scheduler is running. */ @@ -1512,11 +1405,9 @@ static int __init rcu_spawn_kthreads(void) int cpu; rcu_scheduler_fully_active = 1; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) - (void)rcu_spawn_one_cpu_kthread(cpu); - } + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state); (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { @@ -1533,10 +1424,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) { - (void)rcu_spawn_one_cpu_kthread(cpu); + if (rcu_scheduler_fully_active) (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); - } } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1560,22 +1449,10 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_cpu_kthread(int cpu) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ -} - static int __init rcu_scheduler_really_started(void) { rcu_scheduler_fully_active = 1; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94e..31968931f146 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -108,11 +108,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c/%d ktl=%x", + seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), - per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); -- cgit v1.2.2 From 532b1858c5241bedfff5ab863d7cf012e8b81a6b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 8 Aug 2012 16:16:04 +0200 Subject: sched: Fix __sched_period comment It should be sched_nr_latency so fix it before it annoys me more. Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344435364-18632-1-git-send-email-bp@amd64.org Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c219bf8d704c..99285a85e210 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) /* * The idea is to set a period in which each task runs once. * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * When there are too many tasks (sched_nr_latency) we have to stretch * this period because otherwise the slices get too small. * * p = (nr <= nl) ? l : l*nr/nl -- cgit v1.2.2 From edde96eafc91a510f404e7b82cfc0ecb608505ee Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 4 Aug 2012 11:49:47 +0300 Subject: sched: Document schedule() entry points This patch adds a comment on top of the schedule() function to explain to scheduler newbies how the main scheduler function is entered. Acked-by: Randy Dunlap Explained-by: Ingo Molnar Explained-by: Peter Zijlstra Signed-off-by: Pekka Enberg Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344070187-2420-1-git-send-email-penberg@kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc6..c9a3655e572d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3367,6 +3367,40 @@ pick_next_task(struct rq *rq) /* * __schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space */ static void __sched __schedule(void) { -- cgit v1.2.2 From 78feefc512a09165627dd534111f651b6c8e605f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 6 Aug 2012 16:41:59 +0800 Subject: sched: using dst_rq instead of this_rq during load balance As we already have dst_rq in lb_env, using or changing "this_rq" do not make sense. This patch will replace "this_rq" with dst_rq in load_balance, and we don't need to change "this_rq" while process LBF_SOME_PINNED any more. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/501F8357.3070102@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 99285a85e210..287bfaca6420 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4283,7 +4283,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == this_rq); + BUG_ON(busiest == env.dst_rq); schedstat_add(sd, lb_imbalance[idle], env.imbalance); @@ -4304,7 +4304,7 @@ redo: update_h_load(env.src_cpu); more_balance: local_irq_save(flags); - double_rq_lock(this_rq, busiest); + double_rq_lock(env.dst_rq, busiest); /* * cur_ld_moved - load moved in current iteration @@ -4312,7 +4312,7 @@ more_balance: */ cur_ld_moved = move_tasks(&env); ld_moved += cur_ld_moved; - double_rq_unlock(this_rq, busiest); + double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { @@ -4348,8 +4348,7 @@ more_balance: if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && lb_iterations++ < max_lb_iterations) { - this_rq = cpu_rq(env.new_dst_cpu); - env.dst_rq = this_rq; + env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; -- cgit v1.2.2 From f03542a7019c600163ac4441d8a826c92c1bd510 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 26 Jul 2012 08:55:34 +0800 Subject: sched: recover SD_WAKE_AFFINE in select_task_rq_fair and code clean up Since power saving code was removed from sched now, the implement code is out of service in this function, and even pollute other logical. like, 'want_sd' never has chance to be set '0', that remove the effect of SD_WAKE_AFFINE here. So, clean up the obsolete code, includes SD_PREFER_LOCAL. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5028F431.6000306@intel.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 1 - kernel/sched/fair.c | 34 +++------------------------------- 2 files changed, 3 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c9a3655e572d..4376c9f34790 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6622,7 +6622,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 0*SD_WAKE_AFFINE - | 0*SD_PREFER_LOCAL | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 287bfaca6420..01d3eda6b7f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2686,7 +2686,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; - int want_sd = 1; int sync = wake_flags & WF_SYNC; if (p->nr_cpus_allowed == 1) @@ -2703,27 +2702,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (!(tmp->flags & SD_LOAD_BALANCE)) continue; - /* - * If power savings logic is enabled for a domain, see if we - * are not overloaded, if so, don't balance wider. - */ - if (tmp->flags & (SD_PREFER_LOCAL)) { - unsigned long power = 0; - unsigned long nr_running = 0; - unsigned long capacity; - int i; - - for_each_cpu(i, sched_domain_span(tmp)) { - power += power_of(i); - nr_running += cpu_rq(i)->cfs.nr_running; - } - - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - - if (nr_running < capacity) - want_sd = 0; - } - /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -2731,21 +2709,15 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; - want_affine = 0; - } - - if (!want_sd && !want_affine) break; + } - if (!(tmp->flags & sd_flag)) - continue; - - if (want_sd) + if (tmp->flags & sd_flag) sd = tmp; } if (affine_sd) { - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) prev_cpu = cpu; new_cpu = select_idle_sibling(p, prev_cpu); -- cgit v1.2.2 From 1265057fa02c7bed3b6d9ddc8a2048065a370364 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Aug 2012 09:38:42 -0700 Subject: workqueue: fix CPU binding of flush_delayed_work[_sync]() delayed_work encodes the workqueue to use and the last CPU in delayed_work->work.data while it's on timer. The target CPU is implicitly recorded as the CPU the timer is queued on and delayed_work_timer_fn() queues delayed_work->work to the CPU it is running on. Unfortunately, this leaves flush_delayed_work[_sync]() no way to find out which CPU the delayed_work was queued for when they try to re-queue after killing the timer. Currently, it chooses the local CPU flush is running on. This can unexpectedly move a delayed_work queued on a specific CPU to another CPU and lead to subtle errors. There isn't much point in trying to save several bytes in struct delayed_work, which is already close to a hundred bytes on 64bit with all debug options turned off. This patch adds delayed_work->cpu to remember the CPU it's queued for. Note that if the timer is migrated during CPU down, the work item could be queued to the downed global_cwq after this change. As a detached global_cwq behaves like an unbound one, this doesn't change much for the delayed_work. Signed-off-by: Tejun Heo Cc: Linus Torvalds Cc: Ingo Molnar Cc: Andrew Morton --- kernel/workqueue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ae2c0979fe..11723c5b2b20 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1319,7 +1319,7 @@ void delayed_work_timer_fn(unsigned long __data) struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); local_irq_disable(); - __queue_work(WORK_CPU_UNBOUND, cwq->wq, &dwork->work); + __queue_work(dwork->cpu, cwq->wq, &dwork->work); local_irq_enable(); } EXPORT_SYMBOL_GPL(delayed_work_timer_fn); @@ -1356,6 +1356,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, set_work_cwq(work, get_cwq(lcpu, wq), 0); + dwork->cpu = cpu; timer->expires = jiffies + delay; if (unlikely(cpu != WORK_CPU_UNBOUND)) @@ -2997,7 +2998,7 @@ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(WORK_CPU_UNBOUND, + __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); @@ -3020,7 +3021,7 @@ bool flush_delayed_work_sync(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(WORK_CPU_UNBOUND, + __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); local_irq_enable(); return flush_work_sync(&dwork->work); -- cgit v1.2.2 From 23657bb192f14b789e4c478def8f11ecc95b4f6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Aug 2012 17:08:19 -0700 Subject: workqueue: add missing wmb() in clear_work_data() Any operation which clears PENDING should be preceded by a wmb to guarantee that the next PENDING owner sees all the changes made before PENDING release. There are only two places where PENDING is cleared - set_work_cpu_and_clear_pending() and clear_work_data(). The caller of the former already does smp_wmb() but the latter doesn't have any. Move the wmb above set_work_cpu_and_clear_pending() into it and add one to clear_work_data(). There hasn't been any report related to this issue, and, given how clear_work_data() is used, it is extremely unlikely to have caused any actual problems on any architecture. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/workqueue.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 11723c5b2b20..4fef9527a620 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -570,11 +570,19 @@ static void set_work_cwq(struct work_struct *work, static void set_work_cpu_and_clear_pending(struct work_struct *work, unsigned int cpu) { + /* + * The following wmb is paired with the implied mb in + * test_and_set_bit(PENDING) and ensures all updates to @work made + * here are visible to and precede any updates by the next PENDING + * owner. + */ + smp_wmb(); set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); } static void clear_work_data(struct work_struct *work) { + smp_wmb(); /* see set_work_cpu_and_clear_pending() */ set_work_data(work, WORK_STRUCT_NO_CPU, 0); } @@ -2182,14 +2190,11 @@ __acquires(&gcwq->lock) wake_up_worker(pool); /* - * Record the last CPU and clear PENDING. The following wmb is - * paired with the implied mb in test_and_set_bit(PENDING) and - * ensures all updates to @work made here are visible to and - * precede any updates by the next PENDING owner. Also, clear - * PENDING inside @gcwq->lock so that PENDING and queued state - * changes happen together while IRQ is disabled. + * Record the last CPU and clear PENDING which should be the last + * update to @work. Also, do this inside @gcwq->lock so that + * PENDING and queued state changes happen together while IRQ is + * disabled. */ - smp_wmb(); set_work_cpu_and_clear_pending(work, gcwq->cpu); spin_unlock_irq(&gcwq->lock); -- cgit v1.2.2 From 4f82f45730c68fdaf9b0472495a965188404866e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 May 2012 10:37:59 -0600 Subject: net ip6 flowlabel: Make owner a union of struct pid * and kuid_t Correct a long standing omission and use struct pid in the owner field of struct ip6_flowlabel when the share type is IPV6_FL_S_PROCESS. This guarantees we don't have issues when pid wraparound occurs. Use a kuid_t in the owner field of struct ip6_flowlabel when the share type is IPV6_FL_S_USER to add user namespace support. In /proc/net/ip6_flowlabel capture the current pid namespace when opening the file and release the pid namespace when the file is closed ensuring we print the pid owner value that is meaning to the reader of the file. Similarly use from_kuid_munged to print uid values that are meaningful to the reader of the file. This requires exporting pid_nr_ns so that ipv6 can continue to built as a module. Yoiks what silliness Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index e86b291ad834..aebd4f5aaf41 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) } return nr; } +EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { -- cgit v1.2.2 From 523a6a945f3cf5f1d337e50634687a577a732a5f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Aug 2012 19:11:22 -0700 Subject: pidns: Export free_pid_ns There is a least one modular user so export free_pid_ns so modules can capture and use the pid namespace on the very rare occasion when it makes sense. Acked-by: David S. Miller Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b3c7fd554250..baa528d7dfbd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -16,6 +16,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -144,6 +145,7 @@ void free_pid_ns(struct kref *kref) if (parent != NULL) put_pid_ns(parent); } +EXPORT_SYMBOL_GPL(free_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { -- cgit v1.2.2 From 330dad5b9c9555632578c00e94e85c122561c5c7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 15 Aug 2012 23:25:36 +0900 Subject: workqueue: use enum value to set array size of pools in gcwq Commit 3270476a6c0ce322354df8679652f060d66526dc ('workqueue: reimplement WQ_HIGHPRI using a separate worker_pool') introduce separate worker_pool for HIGHPRI. Although there is NR_WORKER_POOLS enum value which represent size of pools, definition of worker_pool in gcwq doesn't use it. Using it makes code robust and prevent future mistakes. So change code to use this enum value. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4fef9527a620..49d8f4a0110d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -183,7 +183,8 @@ struct global_cwq { struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct worker_pool pools[2]; /* normal and highpri pools */ + struct worker_pool pools[NR_WORKER_POOLS]; + /* normal and highpri pools */ wait_queue_head_t rebind_hold; /* rebind hold wait */ } ____cacheline_aligned_in_smp; -- cgit v1.2.2 From b75cac9368fa91636e17d0f7950b35d837154e14 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 15 Aug 2012 23:25:37 +0900 Subject: workqueue: correct req_cpu in trace_workqueue_queue_work() When we do tracing workqueue_queue_work(), it records requested cpu. But, if !(@wq->flag & WQ_UNBOUND) and @cpu is WORK_CPU_UNBOUND, requested cpu is changed as local cpu. In case of @wq->flag & WQ_UNBOUND, above change is not occured, therefore it is reasonable to correct it. Use temporary local variable for storing requested cpu. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 49d8f4a0110d..c29f2dc0f4fc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1198,6 +1198,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; + unsigned int req_cpu = cpu; /* * While a work item is PENDING && off queue, a task trying to @@ -1253,7 +1254,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(cpu, cwq, work); + trace_workqueue_queue_work(req_cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { spin_unlock(&gcwq->lock); -- cgit v1.2.2 From e42986de481238204f6e0b0f4434da428895c20b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 15 Aug 2012 23:25:38 +0900 Subject: workqueue: change value of lcpu in __queue_delayed_work_on() We assign cpu id into work struct's data field in __queue_delayed_work_on(). In current implementation, when work is come in first time, current running cpu id is assigned. If we do __queue_delayed_work_on() with CPU A on CPU B, __queue_work() invoked in delayed_work_timer_fn() go into the following sub-optimal path in case of WQ_NON_REENTRANT. gcwq = get_gcwq(cpu); if (wq->flags & WQ_NON_REENTRANT && (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { Change lcpu to @cpu and rechange lcpu to local cpu if lcpu is WORK_CPU_UNBOUND. It is sufficient to prevent to go into sub-optimal path. tj: Slightly rephrased the comment. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c29f2dc0f4fc..99ee9b939264 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1356,9 +1356,15 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *gcwq = get_work_gcwq(work); - if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + /* + * If we cannot get the last gcwq from @work directly, + * select the last CPU such that it avoids unnecessarily + * triggering non-reentrancy check in __queue_work(). + */ + lcpu = cpu; + if (gcwq) lcpu = gcwq->cpu; - else + if (lcpu == WORK_CPU_UNBOUND) lcpu = raw_smp_processor_id(); } else { lcpu = WORK_CPU_UNBOUND; -- cgit v1.2.2 From 1aabe902ca3638d862bf0dad5a697d3a8e046b0a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 15 Aug 2012 23:25:39 +0900 Subject: workqueue: introduce system_highpri_wq Commit 3270476a6c0ce322354df8679652f060d66526dc ('workqueue: reimplement WQ_HIGHPRI using a separate worker_pool') introduce separate worker pool for HIGHPRI. When we handle busyworkers for gcwq, it can be normal worker or highpri worker. But, we don't consider this difference in rebind_workers(), we use just system_wq for highpri worker. It makes mismatch between cwq->pool and worker->pool. It doesn't make error in current implementation, but possible in the future. Now, we introduce system_highpri_wq to use proper cwq for highpri workers in rebind_workers(). Following patch fix this issue properly. tj: Even apart from rebinding, having system_highpri_wq generally makes sense. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 99ee9b939264..329c404b68c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -269,12 +269,14 @@ struct workqueue_struct { }; struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_highpri_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; struct workqueue_struct *system_freezable_wq __read_mostly; struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_highpri_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); @@ -3928,6 +3930,7 @@ static int __init init_workqueues(void) } system_wq = alloc_workqueue("events", 0, 0); + system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, @@ -3936,9 +3939,9 @@ static int __init init_workqueues(void) WQ_FREEZABLE, 0); system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", WQ_NON_REENTRANT | WQ_FREEZABLE, 0); - BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq || - !system_nrt_freezable_wq); + BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || + !system_nrt_wq || !system_unbound_wq || !system_freezable_wq || + !system_nrt_freezable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.2 From e2b6a6d570f070aa90ac00d2d10b1488512f8520 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 15 Aug 2012 23:25:40 +0900 Subject: workqueue: use system_highpri_wq for highpri workers in rebind_workers() In rebind_workers(), we do inserting a work to rebind to cpu for busy workers. Currently, in this case, we use only system_wq. This makes a possible error situation as there is mismatch between cwq->pool and worker->pool. To prevent this, we should use system_highpri_wq for highpri worker to match theses. This implements it. tj: Rephrased comment a bit. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 329c404b68c2..8936761b814a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1741,6 +1741,7 @@ retry: /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + struct workqueue_struct *wq; /* morph UNBOUND to REBIND */ worker->flags &= ~WORKER_UNBOUND; @@ -1750,11 +1751,20 @@ retry: work_data_bits(rebind_work))) continue; - /* wq doesn't matter, use the default one */ debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); + + /* + * wq doesn't really matter but let's keep @worker->pool + * and @cwq->pool consistent for sanity. + */ + if (worker_pool_pri(worker->pool)) + wq = system_highpri_wq; + else + wq = system_wq; + + insert_work(get_cwq(gcwq->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } } -- cgit v1.2.2 From 7635d2fd7f0fa63b6ec03050614c314d7139f14a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 15 Aug 2012 23:25:41 +0900 Subject: workqueue: use system_highpri_wq for unbind_work To speed cpu down processing up, use system_highpri_wq. As scheduling priority of workers on it is higher than system_wq and it is not contended by other normal works on this cpu, work on it is processed faster than system_wq. tj: CPU up/downs care quite a bit about latency these days. This shouldn't hurt anything and makes sense. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8936761b814a..7da24711038f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3680,7 +3680,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: /* unbinding should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); - schedule_work_on(cpu, &unbind_work); + queue_work_on(cpu, system_highpri_wq, &unbind_work); flush_work(&unbind_work); break; } -- cgit v1.2.2 From 73fbec604432e1fbfeb1bc59a110dac1f98160f6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Jun 2012 15:57:37 +0200 Subject: sched: Move cputime code to its own file Extract cputime code from the giant sched/core.c and put it in its own file. This make it easier to deal with this particular area and de-bloat a bit more core.c Signed-off-by: Frederic Weisbecker Acked-by: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 557 +------------------------------------------------ kernel/sched/cputime.c | 504 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 63 ++++++ 4 files changed, 570 insertions(+), 556 deletions(-) create mode 100644 kernel/sched/cputime.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 173ea52f3af0..f06d249e103b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4376c9f34790..ae3bcaa3afbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static void update_rq_clock_task(struct rq *rq, s64 delta) { /* @@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -2809,404 +2652,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif - /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. If the root cgroup - * is the only cgroup, then nothing else should be necessary. - * - */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif -} - - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - int index; - - /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += cputime; - - /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) -{ - /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - int index; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; - else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; - else - index = CPUTIME_SYSTEM; - - __account_system_time(p, cputime, cputime_scaled, index); -} - -/* - * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64) cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; -} - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; - - account_steal_time(st); - return st; - } -#endif - return false; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to - * @user_tick: is the tick from userspace - * @rq: the pointer to rq - * - * Tick demultiplexing follows the order - * - pending hardirq update - * - pending softirq update - * - user_time - * - idle_time - * - system time - * - check for guest_time - * - else account as system_time - * - * Check for hardirq is done both for system and user time as there is - * no timer going off while we are on hardirq and hence we may never get an - * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq - * softirq as those do not count in task exec_runtime any more. - */ -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (steal_account_process_tick()) - return; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; - } else if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - * Also, p->stime needs to be updated for ksoftirqd. - */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); - } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); - } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); - } -} - -static void irqtime_account_idle_ticks(int ticks) -{ - int i; - struct rq *rq = this_rq(); - - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) -{ - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); - else - temp = div64_u64(temp, (__force u64) total); - - return (__force cputime_t) temp; -} - -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, utime = p->utime, total = utime + p->stime; - - /* - * Use CFS's precise accounting: - */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - - if (total) - utime = scale_utime(utime, rtime, total); - else - utime = rtime; - - /* - * Compare with previous values, to keep monotonicity: - */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); - - *ut = p->prev_utime; - *st = p->prev_stime; -} - -/* - * Must be called with siglock held. - */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct signal_struct *sig = p->signal; - struct task_cputime cputime; - cputime_t rtime, utime, total; - - thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; -} -#endif - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -8419,6 +7864,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ +struct cpuacct root_cpuacct; + /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..372692bd5376 --- /dev/null +++ b/kernel/sched/cputime.c @@ -0,0 +1,504 @@ +#include +#include +#include +#include +#include +#include "sched.h" + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +DEFINE_PER_CPU(u64, cpu_hardirq_time); +DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +DEFINE_PER_CPU(seqcount_t, irq_time_seq); +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static int irqtime_account_hi_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; +#endif + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += tmp; + ca = parent_ca(ca); + } + rcu_read_unlock(); +#endif +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += cputime; + + /* Add guest time to cpustat. */ + if (TASK_NICE(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) +{ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + int index; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + index = CPUTIME_IRQ; + else if (in_serving_softirq()) + index = CPUTIME_SOFTIRQ; + else + index = CPUTIME_SYSTEM; + + __account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + + if (steal_account_process_tick()) + return; + + if (irqtime_account_hi_update()) { + cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + } else if (irqtime_account_si_update()) { + cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SOFTIRQ); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SYSTEM); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, utime = p->utime, total = utime + p->stime; + + /* + * Use CFS's precise accounting: + */ + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + + if (total) + utime = scale_utime(utime, rtime, total); + else + utime = rtime; + + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + + *ut = p->prev_utime; + *st = p->prev_stime; +} + +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime.utime + cputime.stime; + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f6714d009e77..804c2e5e7872 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -891,6 +891,9 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +extern struct cgroup_subsys cpuacct_subsys; +extern struct cpuacct root_cpuacct; + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { @@ -917,6 +920,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1157,3 +1170,53 @@ enum rq_nohz_flag_bits { #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + -- cgit v1.2.2 From baa36046d09ea6dbc122c795566992318663d9eb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 18 Jun 2012 17:54:14 +0200 Subject: cputime: Consolidate vtime handling on context switch The archs that implement virtual cputime accounting all flush the cputime of a task when it gets descheduled and sometimes set up some ground initialization for the next task to account its cputime. These archs all put their own hooks in their context switch callbacks and handle the off-case themselves. Consolidate this by creating a new account_switch_vtime() callback called in generic code right after a context switch and that these archs must implement to flush the prev task cputime and initialize the next task cputime related state. Signed-off-by: Frederic Weisbecker Acked-by: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae3bcaa3afbf..78d9c965433a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1796,6 +1796,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul */ prev_state = prev->state; + account_switch_vtime(prev); finish_arch_switch(prev); #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); -- cgit v1.2.2 From 044c782ce3a901fbd17cbe701c592f582381174d Mon Sep 17 00:00:00 2001 From: Valentin Ilie Date: Sun, 19 Aug 2012 00:52:42 +0300 Subject: workqueue: fix checkpatch issues Fixed some checkpatch warnings. tj: adapted to wq/for-3.7 and massaged pr_xxx() format strings a bit. Signed-off-by: Valentin Ilie Signed-off-by: Tejun Heo LKML-Reference: <1345326762-21747-1-git-send-email-valentin.ilie@gmail.com> --- kernel/workqueue.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7da24711038f..de429ba000ee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -269,18 +269,18 @@ struct workqueue_struct { }; struct workqueue_struct *system_wq __read_mostly; -struct workqueue_struct *system_highpri_wq __read_mostly; -struct workqueue_struct *system_long_wq __read_mostly; -struct workqueue_struct *system_nrt_wq __read_mostly; -struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezable_wq __read_mostly; -struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); +struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); +struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); +struct workqueue_struct *system_nrt_wq __read_mostly; EXPORT_SYMBOL_GPL(system_nrt_wq); +struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS @@ -2232,11 +2232,9 @@ __acquires(&gcwq->lock) lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); + pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" + " last function: %pf\n", + current->comm, preempt_count(), task_pid_nr(current), f); debug_show_held_locks(current); dump_stack(); } @@ -2790,8 +2788,8 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", - wq->name, flush_cnt); + pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); goto reflush; } @@ -3275,9 +3273,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; if (max_active < 1 || max_active > lim) - printk(KERN_WARNING "workqueue: max_active %d requested for %s " - "is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); return clamp_val(max_active, 1, lim); } -- cgit v1.2.2 From dbf2576e37da0fcc7aacbfbb9fd5d3de7888a3c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Aug 2012 14:51:23 -0700 Subject: workqueue: make all workqueues non-reentrant By default, each per-cpu part of a bound workqueue operates separately and a work item may be executing concurrently on different CPUs. The behavior avoids some cross-cpu traffic but leads to subtle weirdities and not-so-subtle contortions in the API. * There's no sane usefulness in allowing a single work item to be executed concurrently on multiple CPUs. People just get the behavior unintentionally and get surprised after learning about it. Most either explicitly synchronize or use non-reentrant/ordered workqueue but this is error-prone. * flush_work() can't wait for multiple instances of the same work item on different CPUs. If a work item is executing on cpu0 and then queued on cpu1, flush_work() can only wait for the one on cpu1. Unfortunately, work items can easily cross CPU boundaries unintentionally when the queueing thread gets migrated. This means that if multiple queuers compete, flush_work() can't even guarantee that the instance queued right before it is finished before returning. * flush_work_sync() was added to work around some of the deficiencies of flush_work(). In addition to the usual flushing, it ensures that all currently executing instances are finished before returning. This operation is expensive as it has to walk all CPUs and at the same time fails to address competing queuer case. Incorrectly using flush_work() when flush_work_sync() is necessary is an easy error to make and can lead to bugs which are difficult to reproduce. * Similar problems exist for flush_delayed_work[_sync](). Other than the cross-cpu access concern, there's no benefit in allowing parallel execution and it's plain silly to have this level of contortion for workqueue which is widely used from core code to extremely obscure drivers. This patch makes all workqueues non-reentrant. If a work item is executing on a different CPU when queueing is requested, it is always queued to that CPU. This guarantees that any given work item can be executing on one CPU at maximum and if a work item is queued and executing, both are on the same CPU. The only behavior change which may affect workqueue users negatively is that non-reentrancy overrides the affinity specified by queue_work_on(). On a reentrant workqueue, the affinity specified by queue_work_on() is always followed. Now, if the work item is executing on one of the CPUs, the work item will be queued there regardless of the requested affinity. I've reviewed all workqueue users which request explicit affinity, and, fortunately, none seems to be crazy enough to exploit parallel execution of the same work item. This adds an additional busy_hash lookup if the work item was previously queued on a different CPU. This shouldn't be noticeable under any sane workload. Work item queueing isn't a very high-frequency operation and they don't jump across CPUs all the time. In a micro benchmark to exaggerate this difference - measuring the time it takes for two work items to repeatedly jump between two CPUs a number (10M) of times with busy_hash table densely populated, the difference was around 3%. While the overhead is measureable, it is only visible in pathological cases and the difference isn't huge. This change brings much needed sanity to workqueue and makes its behavior consistent with timer. I think this is the right tradeoff to make. This enables significant simplification of workqueue API. Simplification patches will follow. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index de429ba000ee..c4feef9798ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1225,14 +1225,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, cpu = raw_smp_processor_id(); /* - * It's multi cpu. If @wq is non-reentrant and @work - * was previously on a different cpu, it might still - * be running there, in which case the work needs to - * be queued on that cpu to guarantee non-reentrance. + * It's multi cpu. If @work was previously on a different + * cpu, it might still be running there, in which case the + * work needs to be queued on that cpu to guarantee + * non-reentrancy. */ gcwq = get_gcwq(cpu); - if (wq->flags & WQ_NON_REENTRANT && - (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + last_gcwq = get_work_gcwq(work); + + if (last_gcwq && last_gcwq != gcwq) { struct worker *worker; spin_lock(&last_gcwq->lock); -- cgit v1.2.2 From 606a5020b9bdceb20b4f43e11db0054afa349028 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Aug 2012 14:51:23 -0700 Subject: workqueue: gut flush[_delayed]_work_sync() Now that all workqueues are non-reentrant, flush[_delayed]_work_sync() are equivalent to flush[_delayed]_work(). Drop the separate implementation and make them thin wrappers around flush[_delayed]_work(). * start_flush_work() no longer takes @wait_executing as the only left user - flush_work() - always sets it to %true. * __cancel_work_timer() uses flush_work() instead of wait_on_work(). Signed-off-by: Tejun Heo --- kernel/workqueue.c | 122 +++++------------------------------------------------ 1 file changed, 10 insertions(+), 112 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c4feef9798ea..5f13a9a2c792 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2801,8 +2801,7 @@ reflush: } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, - bool wait_executing) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct global_cwq *gcwq; @@ -2824,13 +2823,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, cwq = get_work_cwq(work); if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; - } else if (wait_executing) { + } else { worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; cwq = worker->current_cwq; - } else - goto already_gone; + } insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); @@ -2857,15 +2855,8 @@ already_gone: * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. + * Wait until @work has finished execution. @work is guaranteed to be idle + * on return if it hasn't been requeued since flush started. * * RETURNS: * %true if flush_work() waited for the work to finish execution, @@ -2878,85 +2869,15 @@ bool flush_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr, true)) { + if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return true; - } else - return false; -} -EXPORT_SYMBOL_GPL(flush_work); - -static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else + } else { return false; -} - -static bool wait_on_work(struct work_struct *work) -{ - bool ret = false; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - ret |= wait_on_cpu_work(get_gcwq(cpu), work); - return ret; -} - -/** - * flush_work_sync - wait until a work has finished execution - * @work: the work to flush - * - * Wait until @work has finished execution. On return, it's - * guaranteed that all queueing instances of @work which happened - * before this function is called are finished. In other words, if - * @work hasn't been requeued since this function was called, @work is - * guaranteed to be idle on return. - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work_sync(struct work_struct *work) -{ - struct wq_barrier barr; - bool pending, waited; - - /* we'll wait for executions separately, queue barr only if pending */ - pending = start_flush_work(work, &barr, false); - - /* wait for executions to finish */ - waited = wait_on_work(work); - - /* wait for the pending one */ - if (pending) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); } - - return pending || waited; } -EXPORT_SYMBOL_GPL(flush_work_sync); +EXPORT_SYMBOL_GPL(flush_work); static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { @@ -2970,14 +2891,14 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) * would be waiting for before retrying. */ if (unlikely(ret == -ENOENT)) - wait_on_work(work); + flush_work(work); } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ mark_work_canceling(work); local_irq_restore(flags); - wait_on_work(work); + flush_work(work); clear_work_data(work); return ret; } @@ -3029,29 +2950,6 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); -/** - * flush_delayed_work_sync - wait for a dwork to finish - * @dwork: the delayed work to flush - * - * Delayed timer is cancelled and the pending work is queued for - * execution immediately. Other than timer handling, its behavior - * is identical to flush_work_sync(). - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_delayed_work_sync(struct delayed_work *dwork) -{ - local_irq_disable(); - if (del_timer_sync(&dwork->timer)) - __queue_work(dwork->cpu, - get_work_cwq(&dwork->work)->wq, &dwork->work); - local_irq_enable(); - return flush_work_sync(&dwork->work); -} -EXPORT_SYMBOL(flush_delayed_work_sync); - /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel -- cgit v1.2.2 From ae930e0f4e66fd540c6fbad9f1e2a7743d8b9afe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Aug 2012 14:51:23 -0700 Subject: workqueue: gut system_nrt[_freezable]_wq() Now that all workqueues are non-reentrant, system[_freezable]_wq() are equivalent to system_nrt[_freezable]_wq(). Replace the latter with wrappers around system[_freezable]_wq(). The wrapping goes through inline functions so that __deprecated can be added easily. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f13a9a2c792..85bd3409b9f5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -274,14 +274,10 @@ struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); -struct workqueue_struct *system_nrt_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_nrt_wq); struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); -struct workqueue_struct *system_nrt_freezable_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include @@ -3838,16 +3834,12 @@ static int __init init_workqueues(void) system_wq = alloc_workqueue("events", 0, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); - system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); - system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", - WQ_NON_REENTRANT | WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || - !system_nrt_wq || !system_unbound_wq || !system_freezable_wq || - !system_nrt_freezable_wq); + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.2 From 3b07e9ca26866697616097044f25fbe53dbab693 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Aug 2012 14:51:24 -0700 Subject: workqueue: deprecate system_nrt[_freezable]_wq system_nrt[_freezable]_wq are now spurious. Mark them deprecated and convert all users to system[_freezable]_wq. If you're cc'd and wondering what's going on: Now all workqueues are non-reentrant, so there's no reason to use system_nrt[_freezable]_wq. Please use system[_freezable]_wq instead. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-By: Lai Jiangshan Cc: Jens Axboe Cc: David Airlie Cc: Jiri Kosina Cc: "David S. Miller" Cc: Rusty Russell Cc: "Paul E. McKenney" Cc: David Howells --- kernel/srcu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 2095be3318d5..97c465ebd844 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - queue_delayed_work(system_nrt_wq, &sp->work, 0); + schedule_delayed_work(&sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); + schedule_delayed_work(&sp->work, SRCU_INTERVAL); } /* -- cgit v1.2.2 From b3ae66f209e8929db62b5a5f874ab2cdcf5ef1d4 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 30 Jul 2012 22:39:06 -0700 Subject: genirq: Export irq_set_chip_and_handler_name() Export irq_set_chip_and_handler_name() to modules to allow them to do things such as irq_set_chip_and_handler(....); This fixes ERROR: "irq_set_chip_and_handler_name" \ [drivers/gpio/gpio-pcf857x.ko] undefined! when gpio-pcf857x.c is being built as a module. Signed-off-by: Kuninori Morimoto Cc: Linus Walleij Cc: Stephen Rothwell Cc: Greg KH Link: http://lkml.kernel.org/r/873948trpk.wl%25kuninori.morimoto.gx@renesas.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eebd6d5cfb44..57d86d07221e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -671,6 +671,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_set_chip(irq, chip); __irq_set_handler(irq, handle, 0, name); } +EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { -- cgit v1.2.2 From 17d83127d4c2b322dd8f217e0ac08c66eb403779 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 30 Jul 2012 22:39:20 -0700 Subject: genirq: Export dummy_irq_chip Export dummy_irq_chip to modules to allow them to do things such as irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_level_irq); This fixes ERROR: "dummy_irq_chip" [drivers/gpio/gpio-pcf857x.ko] undefined! when gpio-pcf857x.c is being built as a module. Signed-off-by: Kuninori Morimoto Cc: Linus Walleij Cc: Stephen Rothwell Cc: Greg KH Link: http://lkml.kernel.org/r/871ujstrp6.wl%25kuninori.morimoto.gx@renesas.com Signed-off-by: Thomas Gleixner --- kernel/irq/dummychip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index b5fcd96c7102..988dc58e8847 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -6,6 +6,7 @@ */ #include #include +#include #include "internals.h" @@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = { .irq_mask = noop, .irq_unmask = noop, }; +EXPORT_SYMBOL_GPL(dummy_irq_chip); -- cgit v1.2.2 From e52b1db37b89b69ceb08b521a808bd2cf4724481 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Aug 2012 11:10:25 -0700 Subject: timer: Generalize timer->base flags handling To prepare for addition of another flag, generalize timer->base flags handling. * Rename from TBASE_*_FLAG to TIMER_* and make them LU constants. * Define and use TIMER_FLAG_MASK for flags masking so that multiple flags can be handled correctly. * Don't dereference timer->base directly even if !tbase_get_deferrable(). All two such places are already passed in @base, so use it instead. * Make sure tvec_base's alignment is large enough for timer->base flags using BUILD_BUG_ON(). Signed-off-by: Tejun Heo Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1344449428-24962-2-git-send-email-tj@kernel.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eba..cf7af56940b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -92,12 +92,12 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); + return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); } static inline struct tvec_base *tbase_get_base(struct tvec_base *base) { - return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); + return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); } static inline void timer_set_deferrable(struct timer_list *timer) @@ -108,8 +108,9 @@ static inline void timer_set_deferrable(struct timer_list *timer) static inline void timer_set_base(struct timer_list *timer, struct tvec_base *new_base) { - timer->base = (struct tvec_base *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; + + timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); } static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -686,7 +687,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) - timer->base->active_timers--; + base->active_timers--; } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -697,7 +698,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, detach_timer(timer, clear_pending); if (!tbase_get_deferrable(timer->base)) { - timer->base->active_timers--; + base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } @@ -1800,9 +1801,13 @@ static struct notifier_block __cpuinitdata timers_nb = { void __init init_timers(void) { - int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); + int err; + + /* ensure there are enough low bits for flags in timer->base pointer */ + BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); + err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); init_timer_stats(); BUG_ON(err != NOTIFY_OK); -- cgit v1.2.2 From fc683995a6c4e604d62ab9a488ac2c1ba94fa868 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Aug 2012 11:10:27 -0700 Subject: timer: Clean up timer initializers Over time, timer initializers became messy with unnecessarily duplicated code which are inconsistently spread across timer.h and timer.c. This patch cleans up timer initializers. * timer.c::__init_timer() is renamed to do_init_timer(). * __TIMER_INITIALIZER() added. It takes @flags and all initializers are wrappers around it. * init_timer[_on_stack]_key() now take @flags. * __init_timer[_on_stack]() added. They take @flags and all init macros are wrappers around them. * __setup_timer[_on_stack]() added. It uses __init_timer() and takes @flags. All setup macros are wrappers around the two. Note that this patch doesn't add missing init/setup combinations - e.g. init_timer_deferrable_on_stack(). Adding missing ones is trivial. Signed-off-by: Tejun Heo Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1344449428-24962-4-git-send-email-tj@kernel.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 56 ++++++++++++++------------------------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index cf7af56940b7..8d185a1677cc 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -100,11 +100,6 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); } -static inline void timer_set_deferrable(struct timer_list *timer) -{ - timer->base = TBASE_MAKE_DEFERRED(timer->base); -} - static inline void timer_set_base(struct timer_list *timer, struct tvec_base *new_base) { @@ -564,16 +559,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer) debug_object_assert_init(timer, &timer_debug_descr); } -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key); +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer, name, key); + do_init_timer(timer, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); @@ -614,12 +607,13 @@ static inline void debug_assert_init(struct timer_list *timer) debug_timer_assert_init(timer); } -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { + struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + timer->entry.next = NULL; - timer->base = __raw_get_cpu_var(tvec_bases); + timer->base = (void *)((unsigned long)base | flags); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -629,22 +623,10 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } -void setup_deferrable_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key, - void (*function)(unsigned long), - unsigned long data) -{ - timer->function = function; - timer->data = data; - init_timer_on_stack_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); - /** * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies @@ -652,24 +634,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +void init_timer_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { debug_init(timer); - __init_timer(timer, name, key); + do_init_timer(timer, flags, name, key); } EXPORT_SYMBOL(init_timer_key); -void init_timer_deferrable_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - init_timer_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL(init_timer_deferrable_key); - static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; -- cgit v1.2.2 From c5f66e99b7cb091e3d51ae8e8156892e8feb7fa3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Aug 2012 11:10:28 -0700 Subject: timer: Implement TIMER_IRQSAFE Timer internals are protected with irq-safe locks but timer execution isn't, so a timer being dequeued for execution and its execution aren't atomic against IRQs. This makes it impossible to wait for its completion from IRQ handlers and difficult to shoot down a timer from IRQ handlers. This issue caused some issues for delayed_work interface. Because there's no way to reliably shoot down delayed_work->timer from IRQ handlers, __cancel_delayed_work() can't share the logic to steal the target delayed_work with cancel_delayed_work_sync(), and can only steal delayed_works which are on queued on timer. Similarly, the pending mod_delayed_work() can't be used from IRQ handlers. This patch adds a new timer flag TIMER_IRQSAFE, which makes the timer to be executed without enabling IRQ after dequeueing such that its dequeueing and execution are atomic against IRQ handlers. This makes it safe to wait for the timer's completion from IRQ handlers, for example, using del_timer_sync(). It can never be executing on the local CPU and if executing on other CPUs it won't be interrupted until done. This will enable simplifying delayed_work cancel/mod interface. Signed-off-by: Tejun Heo Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1344449428-24962-5-git-send-email-tj@kernel.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 8d185a1677cc..706fe4c53e82 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -95,6 +95,11 @@ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); } +static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); +} + static inline struct tvec_base *tbase_get_base(struct tvec_base *base) { return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); @@ -1002,14 +1007,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. The timer's handler must not call - * add_timer_on(). Upon exit the timer is not queued and the handler is - * not running on any CPU. + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's + * handler. The timer's handler must not call add_timer_on(). Upon exit the + * timer is not queued and the handler is not running on any CPU. * - * Note: You must not hold locks that are held in interrupt context - * while calling this function. Even if the lock has nothing to do - * with the timer in question. Here's why: + * Note: For !irqsafe timers, you must not hold locks that are held in + * interrupt context while calling this function. Even if the lock has + * nothing to do with the timer in question. Here's why: * * CPU0 CPU1 * ---- ---- @@ -1046,7 +1051,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. */ - WARN_ON(in_irq()); + WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1153,19 +1158,27 @@ static inline void __run_timers(struct tvec_base *base) while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; + bool irqsafe; timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; + irqsafe = tbase_get_irqsafe(timer->base); timer_stats_account_timer(timer); base->running_timer = timer; detach_expired_timer(timer, base); - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); + if (irqsafe) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } } } base->running_timer = NULL; -- cgit v1.2.2 From e0aecdd874d78b7129a64b056c20e529e2c916df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Aug 2012 13:18:24 -0700 Subject: workqueue: use irqsafe timer for delayed_work Up to now, for delayed_works, try_to_grab_pending() couldn't be used from IRQ handlers because IRQs may happen while delayed_work_timer_fn() is in progress leading to indefinite -EAGAIN. This patch makes delayed_work use the new TIMER_IRQSAFE flag for delayed_work->timer. This makes try_to_grab_pending() and thus mod_delayed_work_on() safe to call from IRQ handlers. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 85bd3409b9f5..b394df8beaee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1048,16 +1048,14 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, * for arbitrarily long * * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting - * preempted while holding PENDING and @work off queue, preemption must be - * disabled on entry. This ensures that we don't return -EAGAIN while - * another task is preempted in this function. + * interrupted while holding PENDING and @work off queue, irq must be + * disabled on entry. This, combined with delayed_work->timer being + * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is * responsible for releasing it using local_irq_restore(*@flags). * - * This function is safe to call from any context other than IRQ handler. - * An IRQ handler may run on top of delayed_work_timer_fn() which can make - * this function return -EAGAIN perpetually. + * This function is safe to call from any context including IRQ handler. */ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) @@ -1072,6 +1070,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (is_dwork) { struct delayed_work *dwork = to_delayed_work(work); + /* + * dwork->timer is irqsafe. If del_timer() fails, it's + * guaranteed that the timer is not queued anywhere and not + * running on the local CPU. + */ if (likely(del_timer(&dwork->timer))) return 1; } @@ -1327,9 +1330,8 @@ void delayed_work_timer_fn(unsigned long __data) struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - local_irq_disable(); + /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, cwq->wq, &dwork->work); - local_irq_enable(); } EXPORT_SYMBOL_GPL(delayed_work_timer_fn); @@ -1444,7 +1446,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * Returns %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * - * This function is safe to call from any context other than IRQ handler. + * This function is safe to call from any context including IRQ handler. * See try_to_grab_pending() for details. */ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, -- cgit v1.2.2 From 57b30ae77bf00d2318df711ef9a4d2a9be0a3a2a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Aug 2012 13:18:24 -0700 Subject: workqueue: reimplement cancel_delayed_work() using try_to_grab_pending() cancel_delayed_work() can't be called from IRQ handlers due to its use of del_timer_sync() and can't cancel work items which are already transferred from timer to worklist. Also, unlike other flush and cancel functions, a canceled delayed_work would still point to the last associated cpu_workqueue. If the workqueue is destroyed afterwards and the work item is re-used on a different workqueue, the queueing code can oops trying to dereference already freed cpu_workqueue. This patch reimplements cancel_delayed_work() using try_to_grab_pending() and set_work_cpu_and_clear_pending(). This allows the function to be called from IRQ handlers and makes its behavior consistent with other flush / cancel functions. Signed-off-by: Tejun Heo Cc: Linus Torvalds Cc: Ingo Molnar Cc: Andrew Morton --- kernel/workqueue.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b394df8beaee..039d0fae171a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2948,6 +2948,36 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +/** + * cancel_delayed_work - cancel a delayed work + * @dwork: delayed_work to cancel + * + * Kill off a pending delayed_work. Returns %true if @dwork was pending + * and canceled; %false if wasn't pending. Note that the work callback + * function may still be running on return, unless it returns %true and the + * work doesn't re-arm itself. Explicitly flush or use + * cancel_delayed_work_sync() to wait on it. + * + * This function is safe to call from any context including IRQ handler. + */ +bool cancel_delayed_work(struct delayed_work *dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); + local_irq_restore(flags); + return true; +} +EXPORT_SYMBOL(cancel_delayed_work); + /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel -- cgit v1.2.2 From 5834ec3aea8a84b70efeb52ee91a8f8b1042cd2a Mon Sep 17 00:00:00 2001 From: Sedat Dilek Date: Thu, 23 Aug 2012 02:47:13 +0200 Subject: PM / Freezer: Fix small typo "regrigerator" Noticed when digging into a suspend issue in linux-next (next-20120821). For more details see . Signed-off-by: Sedat Dilek Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 19db29f67558..87da817f9e13 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the regrigerator. + * time to enter the refrigerator. */ msleep(10); } -- cgit v1.2.2 From 816afe4ff98ee10b1d30fd66361be132a0a5cee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 6 Aug 2012 17:29:49 +0930 Subject: x86/smp: Don't ever patch back to UP if we unplug cpus We still patch SMP instructions to UP variants if we boot with a single CPU, but not at any other time. In particular, not if we unplug CPUs to return to a single cpu. Paul McKenney points out: mean offline overhead is 6251/48=130.2 milliseconds. If I remove the alternatives_smp_switch() from the offline path [...] the mean offline overhead is 550/42=13.1 milliseconds Basically, we're never going to get those 120ms back, and the code is pretty messy. We get rid of: 1) The "smp-alt-once" boot option. It's actually "smp-alt-boot", the documentation is wrong. It's now the default. 2) The skip_smp_alternatives flag used by suspend. 3) arch_disable_nonboot_cpus_begin() and arch_disable_nonboot_cpus_end() which were only used to set this one flag. Signed-off-by: Rusty Russell Cc: Paul McKenney Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/87vcgwwive.fsf@rustcorp.com.au Signed-off-by: Ingo Molnar --- kernel/cpu.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 14d32588cccd..f6bfe3e03f6b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -439,14 +439,6 @@ EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -void __weak arch_disable_nonboot_cpus_begin(void) -{ -} - -void __weak arch_disable_nonboot_cpus_end(void) -{ -} - int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -458,7 +450,6 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -474,8 +465,6 @@ int disable_nonboot_cpus(void) } } - arch_disable_nonboot_cpus_end(); - if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.2 From a2546fae01124fb8063747439300fcf39bac033a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Feb 2011 13:15:59 -0500 Subject: ftrace: Add -mfentry to Makefile on function tracer Thanks to Andi Kleen, gcc 4.6.0 now supports -mfentry for x86 (and hopefully soon for other archs). What this does is to have the function profiler start at the beginning of the function instead of after the stack is set up. As plain -pg (mcount) is called after the stack is set up, and in some cases can have issues with the function graph tracer. It also requires frame pointers to be enabled. The -mfentry now calls __fentry__ at the beginning of the function. This allows for compiling without frame pointers and even has the ability to access parameters if needed. If the architecture and the compiler both support -mfentry then use that instead. Link: http://lkml.kernel.org/r/20120807194059.392617243@goodmis.org Acked-by: H. Peter Anvin Acked-by: Ingo Molnar Cc: Michal Marek Cc: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c07071cc5..9301a0e35e0c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + config HAVE_C_RECORDMCOUNT bool help -- cgit v1.2.2 From 781d06248234e221edb560a18461d65808a8a942 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Feb 2011 13:27:22 -0500 Subject: ftrace: Do not test frame pointers if -mfentry is used The function graph has a test to check if the frame pointer is corrupted, which can happen with various options of gcc with mcount. But this is not an issue with -mfentry as -mfentry does not need nor use frame pointers for function graph tracing. Link: http://lkml.kernel.org/r/20120807194059.773895870@goodmis.org Acked-by: H. Peter Anvin Acked-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ce27c8ba8d31..99b4378393d5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, * * Currently, x86_32 with optimize for size (-Os) makes the latest * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. */ if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); -- cgit v1.2.2 From c9235f4872e810d43bf1b19b92cdbe0ec282bada Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Apr 2012 17:06:34 -0700 Subject: userns: Make credential debugging user namespace safe. Cc: David Howells Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/cred.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d82..48cea3da6d05 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, atomic_read(&cred->usage), read_cred_subscribers(cred)); printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", - cred->uid, cred->euid, cred->suid, cred->fsuid); + from_kuid_munged(&init_user_ns, cred->uid), + from_kuid_munged(&init_user_ns, cred->euid), + from_kuid_munged(&init_user_ns, cred->suid), + from_kuid_munged(&init_user_ns, cred->fsuid)); printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kgid_munged(&init_user_ns, cred->gid), + from_kgid_munged(&init_user_ns, cred->egid), + from_kgid_munged(&init_user_ns, cred->sgid), + from_kgid_munged(&init_user_ns, cred->fsgid)); #ifdef CONFIG_SECURITY printk(KERN_ERR "CRED: ->security is %p\n", cred->security); if ((unsigned long) cred->security >= PAGE_SIZE && -- cgit v1.2.2 From 13af07df9b7e49f1987cf36aa048dc6c49d0f93d Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:29 -0400 Subject: cgroup: revise how we re-populate root directory When remounting cgroupfs with some subsystems added to it and some removed, cgroup will remove all the files in root directory and then re-popluate it. What I'm doing here is, only remove files which belong to subsystems that are to be unbinded, and only create files for newly-added subsystems. The purpose is to have all other files untouched. This is a preparation for cgroup xattr support. v7: - checkpatch warnings fixed v6: - no changes v5: - no changes v4: - refactored cgroup_clear_directory() to not use cgroup_rm_file() - instead of going thru the list of files, get the file list using the subsystems - use 'subsys_mask' instead of {added,removed}_bits and made cgroup_populate_dir() to match the parameters with cgroup_clear_directory() v3: - refresh patches after recent refactoring Original-patch-by: Li Zefan Cc: Li Zefan Cc: Hugh Dickins Cc: Hillf Danton Cc: Lennart Poettering Signed-off-by: Li Zefan Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- kernel/cgroup.c | 61 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..875a7130647c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -824,7 +824,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp); +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -963,12 +964,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) return -ENOENT; } -static void cgroup_clear_directory(struct dentry *dir) +/** + * cgroup_clear_directory - selective removal of base and subsystem files + * @dir: directory containing the files + * @base_files: true if the base files should be removed + * @subsys_mask: mask of the subsystem ids whose files should be removed + */ +static void cgroup_clear_directory(struct dentry *dir, bool base_files, + unsigned long subsys_mask) { struct cgroup *cgrp = __d_cgrp(dir); + struct cgroup_subsys *ss; - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); + for_each_subsys(cgrp->root, ss) { + struct cftype_set *set; + if (!test_bit(ss->subsys_id, &subsys_mask)) + continue; + list_for_each_entry(set, &ss->cftsets, node) + cgroup_rm_file(cgrp, set->cfts); + } + if (base_files) { + while (!list_empty(&cgrp->files)) + cgroup_rm_file(cgrp, NULL); + } } /* @@ -977,8 +995,9 @@ static void cgroup_clear_directory(struct dentry *dir) static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - cgroup_clear_directory(dentry); + cgroup_clear_directory(dentry, true, root->subsys_bits); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1339,6 +1358,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; + unsigned long added_bits, removed_bits; mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1354,6 +1374,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); + added_bits = opts.subsys_bits & ~root->subsys_bits; + removed_bits = root->subsys_bits & ~opts.subsys_bits; + /* Don't allow flags or name to change at remount */ if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { @@ -1369,8 +1392,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry); - cgroup_populate_dir(cgrp); + cgroup_clear_directory(cgrp->dentry, false, removed_bits); + /* re-populate subsystem files */ + cgroup_populate_dir(cgrp, false, added_bits); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1669,7 +1693,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp); + cgroup_populate_dir(root_cgrp, true, root->subsys_bits); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -3843,18 +3867,29 @@ static struct cftype files[] = { { } /* terminate */ }; -static int cgroup_populate_dir(struct cgroup *cgrp) +/** + * cgroup_populate_dir - selectively creation of files in a directory + * @cgrp: target cgroup + * @base_files: true if the base files should be added + * @subsys_mask: mask of the subsystem ids whose files should be added + */ +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask) { int err; struct cgroup_subsys *ss; - err = cgroup_addrm_files(cgrp, NULL, files, true); - if (err < 0) - return err; + if (base_files) { + err = cgroup_addrm_files(cgrp, NULL, files, true); + if (err < 0) + return err; + } /* process cftsets of each subsystem */ for_each_subsys(cgrp->root, ss) { struct cftype_set *set; + if (!test_bit(ss->subsys_id, &subsys_mask)) + continue; list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, ss, set->cfts, true); @@ -3988,7 +4023,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail(&cgrp->allcg_node, &root->allcg_list); - err = cgroup_populate_dir(cgrp); + err = cgroup_populate_dir(cgrp, true, root->subsys_bits); /* If err < 0, we have a half-filled directory - oh well ;) */ mutex_unlock(&cgroup_mutex); -- cgit v1.2.2 From 03b1cde6b22f625ae832b939bc7379ec1466aec5 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:30 -0400 Subject: cgroup: add xattr support This is one of the items in the plumber's wish list. For use cases: >> What would the use case be for this? > > Attaching meta information to services, in an easily discoverable > way. For example, in systemd we create one cgroup for each service, and > could then store data like the main pid of the specific service as an > xattr on the cgroup itself. That way we'd have almost all service state > in the cgroupfs, which would make it possible to terminate systemd and > later restart it without losing any state information. But there's more: > for example, some very peculiar services cannot be terminated on > shutdown (i.e. fakeraid DM stuff) and it would be really nice if the > services in question could just mark that on their cgroup, by setting an > xattr. On the more desktopy side of things there are other > possibilities: for example there are plans defining what an application > is along the lines of a cgroup (i.e. an app being a collection of > processes). With xattrs one could then attach an icon or human readable > program name on the cgroup. > > The key idea is that this would allow attaching runtime meta information > to cgroups and everything they model (services, apps, vms), that doesn't > need any complex userspace infrastructure, has good access control > (i.e. because the file system enforces that anyway, and there's the > "trusted." xattr namespace), notifications (inotify), and can easily be > shared among applications. > > Lennart v7: - no changes v6: - remove user xattr namespace, only allow trusted and security v5: - check for capabilities before setting/removing xattrs v4: - no changes v3: - instead of config option, use mount option to enable xattr support Original-patch-by: Li Zefan Cc: Li Zefan Cc: Tejun Heo Cc: Hugh Dickins Cc: Hillf Danton Cc: Lennart Poettering Signed-off-by: Li Zefan Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- kernel/cgroup.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 875a7130647c..508b4a97ab19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -276,7 +276,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) /* bits in struct cgroupfs_root flags field */ enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_XATTR, /* supports extended attributes */ }; static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -913,15 +914,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); + simple_xattrs_free(&cgrp->xattrs); + kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; + struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); kfree(cfe); + simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -1140,6 +1145,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); + if (test_bit(ROOT_XATTR, &root->flags)) + seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); if (clone_children(&root->top_cgroup)) @@ -1208,6 +1215,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->clone_children = true; continue; } + if (!strcmp(token, "xattr")) { + set_bit(ROOT_XATTR, &opts->flags); + continue; + } if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1425,6 +1436,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); + simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1769,6 +1781,8 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + simple_xattrs_free(&cgrp->xattrs); + kill_litter_super(sb); cgroup_drop_root(root); } @@ -2575,6 +2589,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } +static struct simple_xattrs *__d_xattrs(struct dentry *dentry) +{ + if (S_ISDIR(dentry->d_inode->i_mode)) + return &__d_cgrp(dentry)->xattrs; + else + return &__d_cft(dentry)->xattrs; +} + +static inline int xattr_enabled(struct dentry *dentry) +{ + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + return test_bit(ROOT_XATTR, &root->flags); +} + +static bool is_valid_xattr(const char *name) +{ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + return true; + return false; +} + +static int cgroup_setxattr(struct dentry *dentry, const char *name, + const void *val, size_t size, int flags) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); +} + +static int cgroup_removexattr(struct dentry *dentry, const char *name) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_remove(__d_xattrs(dentry), name); +} + +static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, + void *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_get(__d_xattrs(dentry), name, buf, size); +} + +static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + return simple_xattr_list(__d_xattrs(dentry), buf, size); +} + static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, @@ -2583,11 +2655,22 @@ static const struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; +static const struct inode_operations cgroup_file_inode_operations = { + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, +}; + static const struct inode_operations cgroup_dir_inode_operations = { .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, }; static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -2635,6 +2718,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; + inode->i_op = &cgroup_file_inode_operations; } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ @@ -2695,7 +2779,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype *cft) + struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2705,6 +2789,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + simple_xattrs_init(&cft->xattrs); + /* does @cft->flags tell us to skip creation on @cgrp? */ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) return 0; @@ -2745,9 +2831,9 @@ out: } static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype cfts[], bool is_add) + struct cftype cfts[], bool is_add) { - const struct cftype *cft; + struct cftype *cft; int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2781,7 +2867,7 @@ static void cgroup_cfts_prepare(void) } static void cgroup_cfts_commit(struct cgroup_subsys *ss, - const struct cftype *cfts, bool is_add) + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) { LIST_HEAD(pending); @@ -2832,7 +2918,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; @@ -2862,7 +2948,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered with @ss. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; -- cgit v1.2.2 From a1a71b45a66fd3c3c453b55fbd180f8fccdd1daa Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:31 -0400 Subject: cgroup: rename subsys_bits to subsys_mask In a previous discussion, Tejun Heo suggested to rename references to subsys_bits (added_bits, removed_bits, etc) by something more meaningful. Cc: Li Zefan Cc: Tejun Heo Cc: Hugh Dickins Cc: Hillf Danton Cc: Lennart Poettering Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- kernel/cgroup.c | 84 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 508b4a97ab19..ced292d720b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -111,13 +111,13 @@ struct cgroupfs_root { * The bitmask of subsystems intended to be attached to this * hierarchy */ - unsigned long subsys_bits; + unsigned long subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_bits; + unsigned long actual_subsys_mask; /* A list running through the attached subsystems */ struct list_head subsys_list; @@ -557,7 +557,7 @@ static struct css_set *find_existing_css_set( * won't change, so no need for locking. */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -1002,7 +1002,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) struct dentry *parent; struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - cgroup_clear_directory(dentry, true, root->subsys_bits); + cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1046,22 +1046,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_bits) + unsigned long final_subsys_mask) { - unsigned long added_bits, removed_bits; + unsigned long added_mask, removed_mask; struct cgroup *cgrp = &root->top_cgroup; int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - removed_bits = root->actual_subsys_bits & ~final_bits; - added_bits = final_bits & ~root->actual_subsys_bits; + removed_mask = root->actual_subsys_mask & ~final_subsys_mask; + added_mask = final_subsys_mask & ~root->actual_subsys_mask; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_bits)) + if (!(bit & added_mask)) continue; /* * Nobody should tell us to do a subsys that doesn't exist: @@ -1086,7 +1086,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; unsigned long bit = 1UL << i; - if (bit & added_bits) { + if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); @@ -1099,7 +1099,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgrp); /* refcount was already taken, and we're keeping it */ - } else if (bit & removed_bits) { + } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); @@ -1112,7 +1112,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &rootnode.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); - } else if (bit & final_bits) { + } else if (bit & final_subsys_mask) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); @@ -1129,7 +1129,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); } } - root->subsys_bits = root->actual_subsys_bits = final_bits; + root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; synchronize_rcu(); return 0; @@ -1158,7 +1158,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) } struct cgroup_sb_opts { - unsigned long subsys_bits; + unsigned long subsys_mask; unsigned long flags; char *release_agent; bool clone_children; @@ -1267,7 +1267,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_bits); + set_bit(i, &opts->subsys_mask); one_ss = true; break; @@ -1288,7 +1288,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (ss->disabled) continue; - set_bit(i, &opts->subsys_bits); + set_bit(i, &opts->subsys_mask); } } @@ -1300,19 +1300,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * the cpuset subsystem. */ if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_bits & mask)) + (opts->subsys_mask & mask)) return -EINVAL; /* Can't specify "none" and some subsystems */ - if (opts->subsys_bits && opts->none) + if (opts->subsys_mask && opts->none) return -EINVAL; /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_bits && !opts->name) + if (!opts->subsys_mask && !opts->name) return -EINVAL; /* @@ -1324,7 +1324,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - if (!(bit & opts->subsys_bits)) + if (!(bit & opts->subsys_mask)) continue; if (!try_module_get(subsys[i]->module)) { module_pin_failed = true; @@ -1341,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* drop refcounts only on the ones we took */ unsigned long bit = 1UL << i; - if (!(bit & opts->subsys_bits)) + if (!(bit & opts->subsys_mask)) continue; module_put(subsys[i]->module); } @@ -1351,13 +1351,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_bits) +static void drop_parsed_module_refcounts(unsigned long subsys_mask) { int i; for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - if (!(bit & subsys_bits)) + if (!(bit & subsys_mask)) continue; module_put(subsys[i]->module); } @@ -1369,7 +1369,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; - unsigned long added_bits, removed_bits; + unsigned long added_mask, removed_mask; mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1381,31 +1381,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; /* See feature-removal-schedule.txt */ - if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) + if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_bits = opts.subsys_bits & ~root->subsys_bits; - removed_bits = root->subsys_bits & ~opts.subsys_bits; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - ret = rebind_subsystems(root, opts.subsys_bits); + ret = rebind_subsystems(root, opts.subsys_mask); if (ret) { - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry, false, removed_bits); + cgroup_clear_directory(cgrp->dentry, false, removed_mask); /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, false, added_bits); + cgroup_populate_dir(cgrp, false, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1491,8 +1491,8 @@ static int cgroup_test_super(struct super_block *sb, void *data) * If we asked for subsystems (or explicitly for no * subsystems) then they must match */ - if ((opts->subsys_bits || opts->none) - && (opts->subsys_bits != root->subsys_bits)) + if ((opts->subsys_mask || opts->none) + && (opts->subsys_mask != root->subsys_mask)) return 0; return 1; @@ -1502,7 +1502,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; - if (!opts->subsys_bits && !opts->none) + if (!opts->subsys_mask && !opts->none) return NULL; root = kzalloc(sizeof(*root), GFP_KERNEL); @@ -1515,7 +1515,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) } init_cgroup_root(root); - root->subsys_bits = opts->subsys_bits; + root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1547,7 +1547,7 @@ static int cgroup_set_super(struct super_block *sb, void *data) if (!opts->new_root) return -EINVAL; - BUG_ON(!opts->subsys_bits && !opts->none); + BUG_ON(!opts->subsys_mask && !opts->none); ret = set_anon_super(sb, NULL); if (ret) @@ -1665,7 +1665,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; - ret = rebind_subsystems(root, root->subsys_bits); + ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { free_cg_links(&tmp_cg_links); goto unlock_drop; @@ -1705,7 +1705,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp, true, root->subsys_bits); + cgroup_populate_dir(root_cgrp, true, root->subsys_mask); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -1717,7 +1717,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); @@ -1731,7 +1731,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); drop_modules: - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -4109,7 +4109,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail(&cgrp->allcg_node, &root->allcg_list); - err = cgroup_populate_dir(cgrp, true, root->subsys_bits); + err = cgroup_populate_dir(cgrp, true, root->subsys_mask); /* If err < 0, we have a half-filled directory - oh well ;) */ mutex_unlock(&cgroup_mutex); -- cgit v1.2.2 From 61e1d394984110e2e76f25572d5b1b5d48796751 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 1 Jun 2012 14:49:50 +0530 Subject: uprobes: Remove redundant lock_page/unlock_page Since read_opcode() reads from the referenced page and doesnt modify the page contents nor the page attributes, there is no need to lock the page. Signed-off-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c08a22d02f72..7cff24c60dd7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -280,12 +280,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ if (ret <= 0) return ret; - lock_page(page); vaddr_new = kmap_atomic(page); vaddr &= ~PAGE_MASK; memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); - unlock_page(page); put_page(page); -- cgit v1.2.2 From 8bd874456e2ec49b9e64372ddc89a6f88901d184 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Aug 2012 18:12:30 +0200 Subject: uprobes: Remove check for uprobe variable in handle_swbp() by the time we get here (after we pass cleanup_ret) uprobe is always is set. If it is NULL we leave very early in the code. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7cff24c60dd7..0cefde276641 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1516,17 +1516,15 @@ cleanup_ret: utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; } - if (uprobe) { - if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) + if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); - put_uprobe(uprobe); - } + put_uprobe(uprobe); } /* -- cgit v1.2.2 From 647c42dfd40fec032a4c8525a755160f0765921f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Aug 2012 13:15:09 +0200 Subject: uprobes: Kill uprobes_state->count uprobes_state->count is only needed to avoid the slow path in uprobe_pre_sstep_notifier(). It is also checked in uprobe_munmap() but ironically its only goal to decrement this counter. However, it is very broken. Just some examples: - uprobe_mmap() can race with uprobe_unregister() and wrongly increment the counter if it hits the non-uprobe "int3". Note that install_breakpoint() checks ->consumers first and returns -EEXIST if it is NULL. "atomic_sub() if error" in uprobe_mmap() looks obviously wrong too. - uprobe_munmap() can race with uprobe_register() and wrongly decrement the counter by the same reason. - Suppose an appication tries to increase the mmapped area via sys_mremap(). vma_adjust() does uprobe_munmap(whole_vma) first, this can nullify the counter temporarily and race with another thread which can hit the bp, the application will be killed by SIGTRAP. - Suppose an application mmaps 2 consecutive areas in the same file and one (or both) of these areas has uprobes. In the likely case mmap_region()->vma_merge() suceeds. Like above, this leads to uprobe_munmap/uprobe_mmap from vma_merge()->vma_adjust() but then mmap_region() does another uprobe_mmap(resulting_vma) and doubles the counter. This patch only removes this counter and fixes the compile errors, then we will try to cleanup the changed code and add something else instead. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 38 ++------------------------------------ 1 file changed, 2 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0cefde276641..6f1664d217dc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -678,18 +678,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, uprobe->flags |= UPROBE_COPY_INSN; } - /* - * Ideally, should be updating the probe count after the breakpoint - * has been successfully inserted. However a thread could hit the - * breakpoint we just inserted even before the probe count is - * incremented. If this is the first breakpoint placed, breakpoint - * notifier might ignore uprobes and pass the trap to the thread. - * Hence increment before and decrement on failure. - */ - atomic_inc(&mm->uprobes_state.count); ret = set_swbp(&uprobe->arch, mm, vaddr); - if (ret) - atomic_dec(&mm->uprobes_state.count); return ret; } @@ -697,8 +686,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) - atomic_dec(&mm->uprobes_state.count); + set_orig_insn(&uprobe->arch, mm, vaddr, true); } /* @@ -1051,13 +1039,6 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!is_swbp_at_addr(vma->vm_mm, vaddr)) continue; - - /* - * Unable to insert a breakpoint, but - * breakpoint lies underneath. Increment the - * probe count. - */ - atomic_inc(&vma->vm_mm->uprobes_state.count); } if (!ret) @@ -1068,9 +1049,6 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_unlock(uprobes_mmap_hash(inode)); - if (ret) - atomic_sub(count, &vma->vm_mm->uprobes_state.count); - return ret; } @@ -1089,9 +1067,6 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!atomic_read(&vma->vm_mm->uprobes_state.count)) - return; - inode = vma->vm_file->f_mapping->host; if (!inode) return; @@ -1100,13 +1075,6 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, vma, start, end, &tmp_list); list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); @@ -1217,7 +1185,6 @@ void uprobe_clear_state(struct mm_struct *mm) void uprobe_reset_state(struct mm_struct *mm) { mm->uprobes_state.xol_area = NULL; - atomic_set(&mm->uprobes_state.count, 0); } /* @@ -1585,8 +1552,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask; - if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) - /* task is currently not uprobed */ + if (!current->mm) return 0; utask = current->utask; -- cgit v1.2.2 From f1a45d023193f7d8e55e384090b645d609325393 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Aug 2012 14:13:23 +0200 Subject: uprobes: Kill dup_mmap()->uprobe_mmap(), simplify uprobe_mmap/munmap 1. Kill dup_mmap()->uprobe_mmap(), it was only needed to calculate new_mm->uprobes_state.count removed by the previous patch. If the forking process has a pending uprobe (int3) in vma, it will be copied by copy_page_range(), note that it checks vma->anon_vma so "Don't copy ptes" is not possible after install_breakpoint() which does anon_vma_prepare(). 2. Remove is_swbp_at_addr() and "int count" in uprobe_mmap(). Again, this was needed for uprobes_state.count. As a side effect this fixes the bug pointed out by Srikar, this code lacked the necessary put_uprobe(). 3. uprobe_munmap() becomes a nop after the previous patch. Remove the meaningless code but do not remove the helper, we will need it. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 30 +++--------------------------- kernel/fork.c | 3 --- 2 files changed, 3 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f1664d217dc..ce59c100d65f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1010,7 +1010,7 @@ int uprobe_mmap(struct vm_area_struct *vma) struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret, count; + int ret; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) return 0; @@ -1023,8 +1023,6 @@ int uprobe_mmap(struct vm_area_struct *vma) build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); ret = 0; - count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!ret) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); @@ -1034,19 +1032,11 @@ int uprobe_mmap(struct vm_area_struct *vma) * We can race against uprobe_register(), see the * comment near uprobe_hash(). */ - if (ret == -EEXIST) { + if (ret == -EEXIST) ret = 0; - - if (!is_swbp_at_addr(vma->vm_mm, vaddr)) - continue; - } - - if (!ret) - count++; } put_uprobe(uprobe); } - mutex_unlock(uprobes_mmap_hash(inode)); return ret; @@ -1057,27 +1047,13 @@ int uprobe_mmap(struct vm_area_struct *vma) */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct list_head tmp_list; - struct uprobe *uprobe, *u; - struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - inode = vma->vm_file->f_mapping->host; - if (!inode) - return; - - mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, vma, start, end, &tmp_list); - - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - put_uprobe(uprobe); - } - mutex_unlock(uprobes_mmap_hash(inode)); + /* TODO: unmapping uprobe(s) will need more work */ } /* Slot allocation for XOL */ diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..912b6f6fe5b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,9 +454,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - - if (file) - uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); -- cgit v1.2.2 From 5e5be71ab3fd8bd2076606923791ece1634c199c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Aug 2012 14:49:56 +0200 Subject: uprobes: Change uprobe_mmap() to ignore the errors but check fatal_signal_pending() Once install_breakpoint() fails uprobe_mmap() "ignores" all other uprobes and returns the error. It was never really needed to to stop after the first error, and in fact it was always wrong at least in -ENOTSUPP case. Change uprobe_mmap() to ignore the errors and always return 0. This is not what we want in the long term, but until we teach the callers to handle the failure it would be better to remove the pointless complications. And this doesn't look too bad, the only "reasonable" error is ENOMEM but in this case the caller should be oom-killed in the likely case or the system has more serious problems. However it makes sense to stop if fatal_signal_pending() == T. In particular this helps if the task was oom-killed. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce59c100d65f..298fbbdf57e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -994,23 +994,16 @@ static void build_probe_list(struct inode *inode, } /* - * Called from mmap_region. - * called with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * - * Return -ve no if we fail to insert probes and we cannot - * bail-out. - * Return 0 otherwise. i.e: - * - * - successful insertion of probes - * - (or) no possible probes to be inserted. - * - (or) insertion of probes failed but we can bail-out. + * Currently we ignore all errors and always return 0, the callers + * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) return 0; @@ -1022,24 +1015,16 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - ret = 0; list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!ret) { + if (!fatal_signal_pending(current)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - - ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - /* - * We can race against uprobe_register(), see the - * comment near uprobe_hash(). - */ - if (ret == -EEXIST) - ret = 0; + install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); - return ret; + return 0; } /* -- cgit v1.2.2 From 78f7411668aa0b2006d331f6a288416dd91b8e5d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 17:35:08 +0200 Subject: uprobes: Do not use -EEXIST in install_breakpoint() paths -EEXIST from install_breakpoint() no longer makes sense, all callers should simply treat it as "success". Change the code to return zero and simplify register_for_each_vma(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 298fbbdf57e6..3e2996b809be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -332,7 +332,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) - return -EEXIST; + return 0; if (result) return result; @@ -657,7 +657,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence behave as if probe already existed. */ if (!uprobe->consumers) - return -EEXIST; + return 0; if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); @@ -817,17 +817,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) { + if (is_register) err = install_breakpoint(uprobe, mm, vma, info->vaddr); - /* - * We can race against uprobe_mmap(), see the - * comment near uprobe_hash(). - */ - if (err == -EEXIST) - err = 0; - } else { + else remove_breakpoint(uprobe, mm, info->vaddr); - } + unlock: up_write(&mm->mmap_sem); free: -- cgit v1.2.2 From f8ac4ec9c064b330dcc49e03c450fe74298c4622 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 17:11:42 +0200 Subject: uprobes: Introduce MMF_HAS_UPROBES Add the new MMF_HAS_UPROBES flag. It is set by install_breakpoint() and it is copied by dup_mmap(), uprobe_pre_sstep_notifier() checks it to avoid the slow path if the task was never probed. Perhaps it makes sense to check it in valid_vma(is_register => false) as well. This needs the new dup_mmap()->uprobe_dup_mmap() hook. We can't use uprobe_reset_state() or put MMF_HAS_UPROBES into MMF_INIT_MASK, we need oldmm->mmap_sem to avoid the race with uprobe_register() or mmap() from another thread. Currently we never clear this bit, it can be false-positive after uprobe_unregister() or uprobe_munmap() or if dup_mmap() hits the probed VM_DONTCOPY vma. But this is fine correctness-wise and has no effect unless the task hits the non-uprobe breakpoint. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 22 +++++++++++++++++++++- kernel/fork.c | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3e2996b809be..33870b17e1dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -647,6 +647,7 @@ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) { + bool first_uprobe; int ret; /* @@ -678,7 +679,17 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, uprobe->flags |= UPROBE_COPY_INSN; } + /* + * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), + * the task can hit this breakpoint right after __replace_page(). + */ + first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + if (first_uprobe) + set_bit(MMF_HAS_UPROBES, &mm->flags); + ret = set_swbp(&uprobe->arch, mm, vaddr); + if (ret && first_uprobe) + clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } @@ -1032,6 +1043,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; + if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + return; + /* TODO: unmapping uprobe(s) will need more work */ } @@ -1142,6 +1156,12 @@ void uprobe_reset_state(struct mm_struct *mm) mm->uprobes_state.xol_area = NULL; } +void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) + set_bit(MMF_HAS_UPROBES, &newmm->flags); +} + /* * - search for a free slot. */ @@ -1507,7 +1527,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask; - if (!current->mm) + if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) return 0; utask = current->utask; diff --git a/kernel/fork.c b/kernel/fork.c index 912b6f6fe5b8..cbb5f9fcd3e8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -353,6 +353,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ -- cgit v1.2.2 From 61559a8165da2b6bab7621ac36379c6280efacb6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 17:17:46 +0200 Subject: uprobes: Fold uprobe_reset_state() into uprobe_dup_mmap() Now that we have uprobe_dup_mmap() we can fold uprobe_reset_state() into the new hook and remove it. mmput()->uprobe_clear_state() can't be called before dup_mmap(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 10 ++-------- kernel/fork.c | 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 33870b17e1dd..610e1c8050cf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1148,16 +1148,10 @@ void uprobe_clear_state(struct mm_struct *mm) kfree(area); } -/* - * uprobe_reset_state - Free the area allocated for slots. - */ -void uprobe_reset_state(struct mm_struct *mm) -{ - mm->uprobes_state.xol_area = NULL; -} - void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { + newmm->uprobes_state.xol_area = NULL; + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) set_bit(MMF_HAS_UPROBES, &newmm->flags); } diff --git a/kernel/fork.c b/kernel/fork.c index cbb5f9fcd3e8..2343c9eaaaf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -837,8 +837,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif - uprobe_reset_state(mm); - if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.2 From ded86e7c8fc4404414c4700010c9962ea8bd083a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 18:07:03 +0200 Subject: uprobes: Remove "verify" argument from set_orig_insn() Nobody does set_orig_insn(verify => false), and I think nobody will. Remove this argument. IIUC set_orig_insn(verify => false) was needed to single-step without xol area. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 610e1c8050cf..1666632e6edf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -345,24 +345,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned * @mm: the probed process address space. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. - * @verify: if true, verify existance of breakpoint instruction. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - if (verify) { - int result; + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (!result) + return -EINVAL; - result = is_swbp_at_addr(mm, vaddr); - if (!result) - return -EINVAL; + if (result != 1) + return result; - if (result != 1) - return result; - } return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } @@ -697,7 +695,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - set_orig_insn(&uprobe->arch, mm, vaddr, true); + set_orig_insn(&uprobe->arch, mm, vaddr); } /* -- cgit v1.2.2 From 77f827de07432a74821cf0f831d699544b2d474f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 6 Aug 2012 01:39:57 +0200 Subject: PM / Domains: Add power off/on function for system core suspend stage Introduce function pm_genpd_syscore_switch() and two wrappers around it, pm_genpd_syscore_poweroff() and pm_genpd_syscore_poweron(), allowing the callers to let the generic PM domains framework know that the given device is not necessary any more and its PM domain can be turned off (the former) or that the given device will be required immediately, so its PM domain has to be turned on (the latter) during the system core (syscore) stage of system suspend (or hibernation) and resume. These functions will be used for handling devices registered as clock sources and clock event devices that belong to PM domains. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a70518c9d82f..5dfdc9ea180b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS bool depends on PM +config PM_GENERIC_DOMAINS_SLEEP + def_bool y + depends on PM_SLEEP && PM_GENERIC_DOMAINS + config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS -- cgit v1.2.2 From adc78e6b9946a4b22e22403d961f3b03c469e5d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 6 Aug 2012 01:40:41 +0200 Subject: timekeeping: Add suspend and resume of clock event devices Some clock event devices, for example such that belong to PM domains, need to be handled in a spcial way during the timekeeping suspend and resume (which takes place in the system core, or "syscore", stages of system power transitions) in analogy with clock sources. Introduce .suspend() and .resume() callbacks for clock event devices that will be executed by timekeeping_suspend/_resume(), respectively, next the the clock sources' .suspend() and .resume() callbacks. Signed-off-by: Rafael J. Wysocki --- kernel/time/clockevents.c | 24 ++++++++++++++++++++++++ kernel/time/timekeeping.c | 2 ++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce012a851..30b6de0d977c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume) + dev->resume(dev); +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34e5eac81424..312a675cb240 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -773,6 +773,7 @@ static void timekeeping_resume(void) read_persistent_clock(&ts); + clockevents_resume(); clocksource_resume(); write_seqlock_irqsave(&tk->lock, flags); @@ -832,6 +833,7 @@ static int timekeeping_suspend(void) clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); + clockevents_suspend(); return 0; } -- cgit v1.2.2 From f319da0c6894fcf55e21320e40506418a2aad629 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Aug 2012 11:26:57 +0200 Subject: sched: Fix load avg vs cpu-hotplug Rabik and Paul reported two different issues related to the same few lines of code. Rabik's issue is that the nr_uninterruptible migration code is wrong in that he sees artifacts due to this (Rabik please do expand in more detail). Paul's issue is that this code as it stands relies on us using stop_machine() for unplug, we all would like to remove this assumption so that eventually we can remove this stop_machine() usage altogether. The only reason we'd have to migrate nr_uninterruptible is so that we could use for_each_online_cpu() loops in favour of for_each_possible_cpu() loops, however since nr_uninterruptible() is the only such loop and its using possible lets not bother at all. The problem Rabik sees is (probably) caused by the fact that by migrating nr_uninterruptible we screw rq->calc_load_active for both rqs involved. So don't bother with fancy migration schemes (meaning we now have to keep using for_each_possible_cpu()) and instead fold any nr_active delta after we migrate all tasks away to make sure we don't have any skewed nr_active accounting. Reported-by: Rakib Mullick Reported-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345454817.23018.27.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc6..207a81c769d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5304,27 +5304,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5618,8 +5608,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + calc_load_migrate(rq); break; #endif } -- cgit v1.2.2 From 749c8814f08f12baa4a9c2812a7c6ede7d69507d Mon Sep 17 00:00:00 2001 From: Charles Wang Date: Mon, 20 Aug 2012 16:02:33 +0800 Subject: sched: Add missing call to calc_load_exit_idle() Azat Khuzhin reported high loadavg in Linux v3.6 After checking the upstream scheduler code, I found Peter's commit: 5167e8d5417b sched/nohz: Rewrite and fix load-avg computation -- again not fully applied, missing the call to calc_load_exit_idle(). After that idle exit in sampling window will always be calculated to non-idle, and the load will be higher than normal. This patch adds the missing call to calc_load_exit_idle(). Signed-off-by: Charles Wang Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345449754-27130-1-git-send-email-muming.wq@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f74..3a9e5d5c1091 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); update_cpu_load_nohz(); + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.2 From a4c96ae319b8047f62dedbe1eac79e321c185749 Mon Sep 17 00:00:00 2001 From: Peter Boonstoppel Date: Thu, 9 Aug 2012 15:34:47 -0700 Subject: sched: Unthrottle rt runqueues in __disable_runtime() migrate_tasks() uses _pick_next_task_rt() to get tasks from the real-time runqueues to be migrated. When rt_rq is throttled _pick_next_task_rt() won't return anything, in which case migrate_tasks() can't move all threads over and gets stuck in an infinite loop. Instead unthrottle rt runqueues before migrating tasks. Additionally: move unthrottle_offline_cfs_rqs() to rq_offline_fair() Signed-off-by: Peter Boonstoppel Signed-off-by: Peter Zijlstra Cc: Paul Turner Link: http://lkml.kernel.org/r/5FBF8E85CA34454794F0F7ECBA79798F379D3648B7@HQMAIL04.nvidia.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- kernel/sched/fair.c | 7 +++++-- kernel/sched/rt.c | 1 + kernel/sched/sched.h | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 207a81c769d4..a4ea245f3d85 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5342,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; - /* Ensure any throttled groups are reachable by pick_next_task */ - unthrottle_offline_cfs_rqs(rq); - for ( ; ; ) { /* * There's this thread running, bail when that's the only diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c219bf8d704c..86ad83c45dae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -void unthrottle_offline_cfs_rqs(struct rq *rq) +static void unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -4956,6 +4956,9 @@ static void rq_online_fair(struct rq *rq) static void rq_offline_fair(struct rq *rq) { update_sysctl(); + + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 944cb68420e9..e0b7ba9c040f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -691,6 +691,7 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f6714d009e77..0848fa36c383 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -- cgit v1.2.2 From 9450d57eab5cad36774c297da123062744472588 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 17:45:08 -0700 Subject: sched: Fix kernel-doc warnings in kernel/sched/fair.c Fix two kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3660): Excess function parameter 'cpus' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3806): Excess function parameter 'cpus' description in 'update_sd_lb_stats' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/50303714.3090204@xenotime.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 86ad83c45dae..42d9df6a5ca4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3658,7 +3658,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ @@ -3805,7 +3804,6 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -- cgit v1.2.2 From c751134ef8b070070d5f06348286b29d86424677 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Aug 2012 13:21:05 +0900 Subject: sched: Remove AFFINE_WAKEUPS feature flag Commit beac4c7e4a1c ("sched: Remove AFFINE_WAKEUPS feature") removed use of the flag but left the definition. Get rid of it. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1345090865-20851-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..c38f52ea53dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -11,14 +11,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) */ SCHED_FEAT(START_DEBIT, true) -/* - * Based on load and program behaviour, see if it makes sense to place - * a newly woken task on the same cpu as the task that woke it -- - * improve cache locality. Typically used with SYNC wakeups as - * generated by pipes and the like, see also SYNC_WAKEUPS. - */ -SCHED_FEAT(AFFINE_WAKEUPS, true) - /* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we -- cgit v1.2.2 From 201c373e8e4823700d3160d5c28e1ab18fd1193e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Aug 2012 17:03:24 +0900 Subject: sched/debug: Limit sd->*_idx range on sysctl Various sd->*_idx's are used for refering the rq's load average table when selecting a cpu to run. However they can be set to any number with sysctl knobs so that it can crash the kernel if something bad is given. Fix it by limiting them into the actual range. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345104204-8317-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae66229238a0..ec0f2b81b81c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4896,16 +4896,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX; + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler, + bool load_idx) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } } static struct ctl_table * @@ -4917,30 +4926,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return NULL; set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); + CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[12] is terminator */ return table; -- cgit v1.2.2 From d00535db42805e9ae5eadf1b4a86e01e85674b0c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Aug 2012 11:15:30 +0900 Subject: sched: Add time unit suffix to sched sysctl knobs Unlike others, sched_migration_cost, sched_time_avg and sched_shares_window doesn't have time unit as suffix. Add them. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345083330-19486-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..81c7b1a1a307 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_migration_cost", + .procname = "sched_migration_cost_ns", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, @@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "sched_time_avg", + .procname = "sched_time_avg_ms", .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .procname = "sched_shares_window", + .procname = "sched_shares_window_ns", .data = &sysctl_sched_shares_window, .maxlen = sizeof(unsigned int), .mode = 0644, -- cgit v1.2.2 From 38b8dd6f87398524d02c21ff614c507ba8c9d295 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 3 Jul 2012 14:34:02 +0800 Subject: sched: Remove useless code in yield_to() It's impossible to enter the else branch if we have set skip_clock_update in task_yield_fair(), as yield_to_task_fair() will directly return true after invoke task_yield_fair(). Signed-off-by: Michael Wang Acked-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FF2925A.9060005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec0f2b81b81c..c46a011ce5db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4348,13 +4348,6 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. - */ - rq->skip_clock_update = 0; } out: -- cgit v1.2.2 From a6fa941d94b411bbd2b6421ffbde6db3c93e65ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Aug 2012 14:59:25 +0100 Subject: perf_event: Switch to internal refcount, fix race with close() Don't mess with file refcounts (or keep a reference to file, for that matter) in perf_event. Use explicit refcount of its own instead. Deal with the race between the final reference to event going away and new children getting created for it by use of atomic_long_inc_not_zero() in inherit_event(); just have the latter free what it had allocated and return NULL, that works out just fine (children of siblings of something doomed are created as singletons, same as if the child of leader had been created and immediately killed). Signed-off-by: Al Viro Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820135925.GG23464@ZenIV.linux.org.uk Signed-off-by: Ingo Molnar --- kernel/events/core.c | 62 ++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d9..efef4282b8e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3227,7 +3233,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. */ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, NULL, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6844,14 +6856,6 @@ inherit_event(struct perf_event *parent_event, add_event_to_ctx(child_event, child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - /* * Link this into the parent event's child list */ -- cgit v1.2.2 From 500ad2d8b01390c98bc6dce068bccfa9534b8212 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Thu, 2 Aug 2012 13:46:35 +0530 Subject: perf/hwpb: Invoke __perf_event_disable() if interrupts are already disabled While debugging a warning message on PowerPC while using hardware breakpoints, it was discovered that when perf_event_disable is invoked through hw_breakpoint_handler function with interrupts disabled, a subsequent IPI in the code path would trigger a WARN_ON_ONCE message in smp_call_function_single function. This patch calls __perf_event_disable() when interrupts are already disabled, instead of perf_event_disable(). Reported-by: Edjunior Barbosa Machado Signed-off-by: K.Prasad [naveen.n.rao@linux.vnet.ibm.com: v3: Check to make sure we target current task] Signed-off-by: Naveen N. Rao Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120802081635.5811.17737.stgit@localhost.localdomain [ Fixed build error on MIPS. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index efef4282b8e8..7fee567153f0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1253,7 +1253,7 @@ retry: /* * Cross CPU call to disable a performance event */ -static int __perf_event_disable(void *info) +int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bb38c4d3ee12..9a7b487c6fe2 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att int old_type = bp->attr.bp_type; int err = 0; - perf_event_disable(bp); + /* + * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it + * will not be possible to raise IPIs that invoke __perf_event_disable. + * So call the function directly after making sure we are targeting the + * current task. + */ + if (irqs_disabled() && bp->ctx && bp->ctx->task == current) + __perf_event_disable(bp); + else + perf_event_disable(bp); bp->attr.bp_addr = attr->bp_addr; bp->attr.bp_type = attr->bp_type; -- cgit v1.2.2 From 96e65306b81351b656835c15931d1d237b252f27 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 2 Sep 2012 00:28:19 +0800 Subject: workqueue: UNBOUND -> REBIND morphing in rebind_workers() should be atomic The compiler may compile the following code into TWO write/modify instructions. worker->flags &= ~WORKER_UNBOUND; worker->flags |= WORKER_REBIND; so the other CPU may temporarily see worker->flags which doesn't have either WORKER_UNBOUND or WORKER_REBIND set and perform local wakeup prematurely. Fix it by using single explicit assignment via ACCESS_ONCE(). Because idle workers have another WORKER_NOT_RUNNING flag, this bug doesn't exist for them; however, update it to use the same pattern for consistency. tj: Applied the change to idle workers too and updated comments and patch description a bit. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a10..c462cd60c374 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1396,12 +1396,15 @@ retry: /* set REBIND and kick idle ones, we'll wait for these later */ for_each_worker_pool(pool, gcwq) { list_for_each_entry(worker, &pool->idle_list, entry) { + unsigned long worker_flags = worker->flags; + if (worker->flags & WORKER_REBIND) continue; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; idle_rebind.cnt++; worker->idle_rebind = &idle_rebind; @@ -1434,10 +1437,12 @@ retry: /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + unsigned long worker_flags = worker->flags; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) -- cgit v1.2.2 From 90beca5de591e12482a812f23a7f10690962ed4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:12:33 -0700 Subject: workqueue: move WORKER_REBIND clearing in rebind_workers() to the end of the function This doesn't make any functional difference and is purely to help the next patch to be simpler. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c462cd60c374..d79a18d0c42e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1422,19 +1422,7 @@ retry: goto retry; } - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags &= ~WORKER_REBIND; - - wake_up_all(&gcwq->rebind_hold); - - /* rebind busy workers */ + /* all idle workers are rebound, rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; unsigned long worker_flags = worker->flags; @@ -1454,6 +1442,18 @@ retry: worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); } static struct worker *alloc_worker(void) -- cgit v1.2.2 From ec58815ab0409a921a7c9744eb4ca44866b14d71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:16:32 -0700 Subject: workqueue: fix possible deadlock in idle worker rebinding Currently, rebind_workers() and idle_worker_rebind() are two-way interlocked. rebind_workers() waits for idle workers to finish rebinding and rebound idle workers wait for rebind_workers() to finish rebinding busy workers before proceeding. Unfortunately, this isn't enough. The second wait from idle workers is implemented as follows. wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); rebind_workers() clears WORKER_REBIND, wakes up the idle workers and then returns. If CPU hotplug cycle happens again before one of the idle workers finishes the above wait_event(), rebind_workers() will repeat the first part of the handshake - set WORKER_REBIND again and wait for the idle worker to finish rebinding - and this leads to deadlock because the idle worker would be waiting for WORKER_REBIND to clear. This is fixed by adding another interlocking step at the end - rebind_workers() now waits for all the idle workers to finish the above WORKER_REBIND wait before returning. This ensures that all rebinding steps are complete on all idle workers before the next hotplug cycle can happen. This problem was diagnosed by Lai Jiangshan who also posted a patch to fix the issue, upon which this patch is based. This is the minimal fix and further patches are scheduled for the next merge window to simplify the CPU hotplug path. Signed-off-by: Tejun Heo Original-patch-by: Lai Jiangshan LKML-Reference: <1346516916-1991-3-git-send-email-laijs@cn.fujitsu.com> --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d79a18d0c42e..dc7b8458e275 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1326,6 +1326,15 @@ static void idle_worker_rebind(struct worker *worker) /* we did our part, wait for rebind_workers() to finish up */ wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); + + /* + * rebind_workers() shouldn't finish until all workers passed the + * above WORKER_REBIND wait. Tell it when done. + */ + spin_lock_irq(&worker->pool->gcwq->lock); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); } /* @@ -1448,12 +1457,28 @@ retry: * be cleared inside idle_worker_rebind(). Clear and release. * Clearing %WORKER_REBIND from this foreign context is safe * because these workers are still guaranteed to be idle. + * + * We need to make sure all idle workers passed WORKER_REBIND wait + * in idle_worker_rebind() before returning; otherwise, workers can + * get stuck at the wait if hotplug cycle repeats. */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { worker->flags &= ~WORKER_REBIND; + idle_rebind.cnt++; + } + } wake_up_all(&gcwq->rebind_hold); + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + } } static struct worker *alloc_worker(void) -- cgit v1.2.2 From 65f8c95e46a1827ae8bbc52a817ea308dd7d65ae Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Jul 2012 14:26:15 -0700 Subject: pstore/ftrace: Convert to its own enable/disable debugfs knob With this patch we no longer reuse function tracer infrastructure, now we register our own tracer back-end via a debugfs knob. It's a bit more code, but that is the only downside. On the bright side we have: - Ability to make persistent_ram module removable (when needed, we can move ftrace_ops struct into a module). Note that persistent_ram is still not removable for other reasons, but with this patch it's just one thing less to worry about; - Pstore part is more isolated from the generic function tracer. We tried it already by registering our own tracer in available_tracers, but that way we're loosing ability to see the traces while we record them to pstore. This solution is somewhere in the middle: we only register "internal ftracer" back-end, but not the "front-end"; - When there is only pstore tracing enabled, the kernel will only write to the pstore buffer, omitting function tracer buffer (which, of course, still can be enabled via 'echo function > current_tracer'). Suggested-by: Steven Rostedt Signed-off-by: Anton Vorontsov --- kernel/trace/trace_functions.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a426f410c060..0ad83e3929d1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "trace.h" @@ -75,10 +74,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -/* Our two options */ +/* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, - TRACE_FUNC_OPT_PSTORE = 0x2, }; static struct tracer_flags func_flags; @@ -106,12 +104,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - /* - * So far tracing doesn't support multiple buffers, so - * we make an explicit call for now. - */ - if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) - pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -176,9 +168,6 @@ static struct ftrace_ops trace_stack_ops __read_mostly = static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, -#endif -#ifdef CONFIG_PSTORE_FTRACE - { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, #endif { } /* Always set a last empty entry */ }; @@ -231,8 +220,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - break; - case TRACE_FUNC_OPT_PSTORE: break; default: return -EINVAL; -- cgit v1.2.2 From c6a57bfffea5b673e5b4f9aeff85a00607e59077 Mon Sep 17 00:00:00 2001 From: Luis Gonzalez Fernandez Date: Fri, 7 Sep 2012 21:35:21 +0200 Subject: PM / QoS: Add return code to pm_qos_get_value function. pm_qos_get_value don't return a return code in all cases. It's sure that anything interesting happend after BUG() but this prevent any compilation warning. [rjw: Chaneged the new return value to PM_QOS_DEFAULT_VALUE.] Signed-off-by: Luis Gonzalez Fernandez Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e684026..846bd42c7ed1 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) default: /* runtime check for not using enum */ BUG(); + return PM_QOS_DEFAULT_VALUE; } } -- cgit v1.2.2 From 9f00d9776bc5beb92e8bfc884a7e96ddc5589e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 8 Sep 2012 02:53:54 +0000 Subject: netlink: hide struct module parameter in netlink_kernel_create This patch defines netlink_kernel_create as a wrapper function of __netlink_kernel_create to hide the struct module *me parameter (which seems to be THIS_MODULE in all existing netlink subsystems). Suggested by David S. Miller. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- kernel/audit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ea3b7b6191c7..a24aafa850ae 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -971,8 +971,7 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, - THIS_MODULE, &cfg); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else -- cgit v1.2.2 From 552a37e9360a293cd20e7f8ff1fb326a244c5f1e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:33 -0700 Subject: workqueue: restore POOL_MANAGING_WORKERS This patch restores POOL_MANAGING_WORKERS which was replaced by pool->manager_mutex by 6037315269 "workqueue: use mutex for global_cwq manager exclusion". There's a subtle idle worker depletion bug across CPU hotplug events and we need to distinguish an actual manager and CPU hotplug preventing management. POOL_MANAGING_WORKERS will be used for the former and manager_mutex the later. This patch just lays POOL_MANAGING_WORKERS on top of the existing manager_mutex and doesn't introduce any synchronization changes. The next patch will update it. Note that this patch fixes a non-critical anomaly where too_many_workers() may return %true spuriously while CPU hotplug is in progress. While the issue could schedule idle timer spuriously, it didn't trigger any actual misbehavior. tj: Rewrote patch description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dc7b8458e275..383548ed0b54 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,6 +66,7 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_mutex); + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1827,6 +1828,7 @@ static bool manage_workers(struct worker *worker) if (!mutex_trylock(&pool->manager_mutex)) return ret; + pool->flags |= POOL_MANAGING_WORKERS; pool->flags &= ~POOL_MANAGE_WORKERS; /* @@ -1836,6 +1838,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); + pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->manager_mutex); return ret; } -- cgit v1.2.2 From ee378aa49b594da9bda6a2c768cc5b2ad585f911 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:44 -0700 Subject: workqueue: fix possible idle worker depletion across CPU hotplug To simplify both normal and CPU hotplug paths, worker management is prevented while CPU hoplug is in progress. This is achieved by CPU hotplug holding the same exclusion mechanism used by workers to ensure there's only one manager per pool. If someone else seems to be performing the manager role, workers proceed to execute work items. CPU hotplug using the same mechanism can lead to idle worker depletion because all workers could proceed to execute work items while CPU hotplug is in progress and CPU hotplug itself wouldn't actually perform the worker management duty - it doesn't guarantee that there's an idle worker left when it releases management. This idle worker depletion, under extreme circumstances, can break forward-progress guarantee and thus lead to deadlock. This patch fixes the bug by using separate mechanisms for manager exclusion among workers and hotplug exclusion. For manager exclusion, POOL_MANAGING_WORKERS which was restored by the previous patch is used. pool->manager_mutex is now only used for exclusion between the elected manager and CPU hotplug. The elected manager won't proceed without holding pool->manager_mutex. This ensures that the worker which won the manager position can't skip managing while CPU hotplug is in progress. It will block on manager_mutex and perform management after CPU hotplug is complete. Note that hotplug may happen while waiting for manager_mutex. A manager isn't either on idle or busy list and thus the hoplug code can't unbind/rebind it. Make the manager handle its own un/rebinding. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 383548ed0b54..1e1373bcb3e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1825,10 +1825,45 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (!mutex_trylock(&pool->manager_mutex)) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; pool->flags |= POOL_MANAGING_WORKERS; + + /* + * To simplify both worker management and CPU hotplug, hold off + * management while hotplug is in progress. CPU hotplug path can't + * grab %POOL_MANAGING_WORKERS to achieve this because that can + * lead to idle worker depletion (all become busy thinking someone + * else is managing) which in turn can result in deadlock under + * extreme circumstances. Use @pool->manager_mutex to synchronize + * manager against CPU hotplug. + * + * manager_mutex would always be free unless CPU hotplug is in + * progress. trylock first without dropping @gcwq->lock. + */ + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + spin_unlock_irq(&pool->gcwq->lock); + mutex_lock(&pool->manager_mutex); + /* + * CPU hotplug could have happened while we were waiting + * for manager_mutex. Hotplug itself can't handle us + * because manager isn't either on idle or busy list, and + * @gcwq's state and ours could have deviated. + * + * As hotplug is now excluded via manager_mutex, we can + * simply try to bind. It will succeed or fail depending + * on @gcwq's current state. Try it and adjust + * %WORKER_UNBOUND accordingly. + */ + if (worker_maybe_bind_and_lock(worker)) + worker->flags &= ~WORKER_UNBOUND; + else + worker->flags |= WORKER_UNBOUND; + + ret = true; + } + pool->flags &= ~POOL_MANAGE_WORKERS; /* -- cgit v1.2.2 From bbdc18a3fb6740619f0d037241c85dc6cd4517aa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 10 Sep 2012 12:05:18 +0000 Subject: properly __init-annotate pm_sysrq_init() This is used only as argument to subsys_initcall(). Signed-off-by: Jan Beulich Signed-off-by: Rafael J. Wysocki --- kernel/power/poweroff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index d52359374e85..68197a4e8fc9 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = { .enable_mask = SYSRQ_ENABLE_BOOT, }; -static int pm_sysrq_init(void) +static int __init pm_sysrq_init(void) { register_sysrq_key('o', &sysrq_poweroff_op); return 0; -- cgit v1.2.2 From 15e473046cb6e5d18a4d0057e61d76315230382b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Sep 2012 20:12:54 +0000 Subject: netlink: Rename pid to portid to avoid confusion It is a frequent mistake to confuse the netlink port identifier with a process identifier. Try to reduce this confusion by renaming fields that hold port identifiers portid instead of pid. I have carefully avoided changing the structures exported to userspace to avoid changing the userspace API. I have successfully built an allyesconfig kernel with this change. Signed-off-by: "Eric W. Biederman" Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- kernel/audit.c | 20 ++++++++++---------- kernel/taskstats.c | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a24aafa850ae..e0cf64a0ae2d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -87,11 +87,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK; /* * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_pid contains - * the pid to use to send netlink messages to that process. + * contains the pid of the auditd process and audit_nlk_portid contains + * the portid to use to send netlink messages to that process. */ int audit_pid; -static int audit_nlk_pid; +static int audit_nlk_portid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -401,7 +401,7 @@ static void kauditd_send_skb(struct sk_buff *skb) int err; /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); + err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -692,7 +692,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_set.backlog_limit = audit_backlog_limit; status_set.lost = atomic_read(&audit_lost); status_set.backlog = skb_queue_len(&audit_skb_queue); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, &status_set, sizeof(status_set)); break; case AUDIT_SET: @@ -720,7 +720,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid, sid, 1); audit_pid = new_pid; - audit_nlk_pid = NETLINK_CB(skb).pid; + audit_nlk_portid = NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, @@ -782,7 +782,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -801,7 +801,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -872,7 +872,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; @@ -891,7 +891,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) rcu_read_unlock(); if (!err) - audit_send_reply(NETLINK_CB(skb).pid, seq, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..123793cd06f9 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -467,7 +467,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, REGISTER); + rc = add_del_listener(info->snd_portid, mask, REGISTER); out: free_cpumask_var(mask); return rc; @@ -483,7 +483,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); + rc = add_del_listener(info->snd_portid, mask, DEREGISTER); out: free_cpumask_var(mask); return rc; -- cgit v1.2.2 From e23eb920b0f3978687c497de2ac3eb9e281dab32 Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Thu, 14 Jun 2012 10:04:35 -0700 Subject: audit: export audit_log_task_info At the suggestion of eparis@redhat.com, move this chunk of task logging from audit_log_exit to audit_log_task_info and export this function so it's usuable elsewhere in the kernel. This patch is against git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity#next-ima-appraisal Changelog v2: - add empty audit_log_task_info if CONFIG_AUDITSYSCALL isn't set. Changelog v1: - Initial post. Signed-off-by: Peter Moody Signed-off-by: Mimi Zohar --- kernel/auditsc.c | 74 ++++++++++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4b96415527b8..37f52f27828d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1154,13 +1154,38 @@ error_path: EXPORT_SYMBOL(audit_log_task_context); -static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { + const struct cred *cred; char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; struct vm_area_struct *vma; + char *tty; + + if (!ab) + return; /* tsk == current */ + cred = current_cred(); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + + audit_log_format(ab, + " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + sys_getppid(), + tsk->pid, + tsk->loginuid, cred->uid, cred->gid, + cred->euid, cred->suid, cred->fsuid, + cred->egid, cred->sgid, cred->fsgid, + tsk->sessionid, tty); get_task_comm(name, tsk); audit_log_format(ab, " comm="); @@ -1183,6 +1208,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk audit_log_task_context(ab); } +EXPORT_SYMBOL(audit_log_task_info); + static int audit_log_pid_context(struct audit_context *context, pid_t pid, uid_t auid, uid_t uid, unsigned int sessionid, u32 sid, char *comm) @@ -1585,26 +1612,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - const struct cred *cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; - const char *tty; struct audit_names *n; /* tsk == current */ - context->pid = tsk->pid; - if (!context->ppid) - context->ppid = sys_getppid(); - cred = current_cred(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; - context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; - context->fsgid = cred->fsgid; context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -1619,32 +1632,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - audit_log_format(ab, - " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " ppid=%d pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - context->argv[0], - context->argv[1], - context->argv[2], - context->argv[3], - context->name_count, - context->ppid, - context->pid, - tsk->loginuid, - context->uid, - context->gid, - context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid, tty, - tsk->sessionid); - + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count); audit_log_task_info(ab, tsk); audit_log_key(ab, context->filterkey); -- cgit v1.2.2 From ac3d0da8f3290b3d394cdb7f50604424a7cd6092 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 26 Aug 2012 21:12:09 +0200 Subject: task_work: Make task_work_add() lockless Change task_work's to use llist-like code to avoid pi_lock in task_work_add(), this makes it useable under rq->lock. task_work_cancel() and task_work_run() still use pi_lock to synchronize with each other. (This is in preparation for a deadlock fix.) Suggested-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120826191209.GA4221@redhat.com Signed-off-by: Ingo Molnar --- kernel/task_work.c | 95 +++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index d320d44903bd..f13ec0bda1d5 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -3,25 +3,18 @@ #include int -task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { - struct callback_head *last, *first; - unsigned long flags; - + struct callback_head *head; /* * Not inserting the new work if the task has already passed * exit_task_work() is the responisbility of callers. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); - last = task->task_works; - first = last ? last->next : twork; - twork->next = first; - if (last) - last->next = twork; - task->task_works = twork; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + do { + head = ACCESS_ONCE(task->task_works); + work->next = head; + } while (cmpxchg(&task->task_works, head, work) != head); - /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ if (notify) set_notify_resume(task); return 0; @@ -30,52 +23,60 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { + struct callback_head **pprev = &task->task_works; + struct callback_head *work = NULL; unsigned long flags; - struct callback_head *last, *res = NULL; - + /* + * If cmpxchg() fails we continue without updating pprev. + * Either we raced with task_work_add() which added the + * new entry before this work, we will find it again. Or + * we raced with task_work_run(), *pprev == NULL. + */ raw_spin_lock_irqsave(&task->pi_lock, flags); - last = task->task_works; - if (last) { - struct callback_head *q = last, *p = q->next; - while (1) { - if (p->func == func) { - q->next = p->next; - if (p == last) - task->task_works = q == p ? NULL : q; - res = p; - break; - } - if (p == last) - break; - q = p; - p = q->next; - } + while ((work = ACCESS_ONCE(*pprev))) { + read_barrier_depends(); + if (work->func != func) + pprev = &work->next; + else if (cmpxchg(pprev, work, work->next) == work) + break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return res; + + return work; } void task_work_run(void) { struct task_struct *task = current; - struct callback_head *p, *q; + struct callback_head *work, *head, *next; - while (1) { - raw_spin_lock_irq(&task->pi_lock); - p = task->task_works; - task->task_works = NULL; - raw_spin_unlock_irq(&task->pi_lock); + for (;;) { + work = xchg(&task->task_works, NULL); + if (!work) + break; + /* + * Synchronize with task_work_cancel(). It can't remove + * the first entry == work, cmpxchg(task_works) should + * fail, but it can play with *work and other entries. + */ + raw_spin_unlock_wait(&task->pi_lock); + smp_mb(); - if (unlikely(!p)) - return; + /* Reverse the list to run the works in fifo order */ + head = NULL; + do { + next = work->next; + work->next = head; + head = work; + work = next; + } while (work); - q = p->next; /* head */ - p->next = NULL; /* cut it */ - while (q) { - p = q->next; - q->func(q); - q = p; + work = head; + do { + next = work->next; + work->func(work); + work = next; cond_resched(); - } + } while (work); } } -- cgit v1.2.2 From 9da33de62431c7839f98156720862262272a8380 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 26 Aug 2012 21:12:11 +0200 Subject: task_work: task_work_add() should not succeed after exit_task_work() ed3e694d "move exit_task_work() past exit_files() et.al" destroyed the add/exit synchronization we had, the caller itself should ensure task_work_add() can't race with the exiting task. However, this is not convenient/simple, and the only user which tries to do this is buggy (see the next patch). Unless the task is current, there is simply no way to do this in general. Change exit_task_work()->task_work_run() to use the dummy "work_exited" entry to let task_work_add() know it should fail. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120826191211.GA4228@redhat.com Signed-off-by: Ingo Molnar --- kernel/task_work.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index f13ec0bda1d5..65bd3c92d6f3 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -2,16 +2,17 @@ #include #include +static struct callback_head work_exited; /* all we need is ->next == NULL */ + int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { struct callback_head *head; - /* - * Not inserting the new work if the task has already passed - * exit_task_work() is the responisbility of callers. - */ + do { head = ACCESS_ONCE(task->task_works); + if (unlikely(head == &work_exited)) + return -ESRCH; work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); @@ -30,7 +31,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or - * we raced with task_work_run(), *pprev == NULL. + * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = ACCESS_ONCE(*pprev))) { @@ -51,7 +52,16 @@ void task_work_run(void) struct callback_head *work, *head, *next; for (;;) { - work = xchg(&task->task_works, NULL); + /* + * work->func() can do task_work_add(), do not set + * work_exited unless the list is empty. + */ + do { + work = ACCESS_ONCE(task->task_works); + head = !work && (task->flags & PF_EXITING) ? + &work_exited : NULL; + } while (cmpxchg(&task->task_works, work, head) != work); + if (!work) break; /* -- cgit v1.2.2 From f784e8a7989c0da3062d04bfea3db90f41e8f738 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 26 Aug 2012 21:12:17 +0200 Subject: task_work: Simplify the usage in ptrace_notify() and get_signal_to_deliver() ptrace_notify() and get_signal_to_deliver() do unnecessary things before task_work_run(): 1. smp_mb__after_clear_bit() is not needed, test_and_clear_bit() implies mb(). 2. And we do not need the barrier at all, in this case we only care about the "synchronous" works added by the task itself. 3. No need to clear TIF_NOTIFY_RESUME, and we should not assume task_works is the only user of this flag. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120826191217.GA4238@redhat.com Signed-off-by: Ingo Molnar --- kernel/signal.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index be4f856d52f8..2c681f11b7d2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1971,13 +1971,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - if (unlikely(current->task_works)) { - if (test_and_clear_ti_thread_flag(current_thread_info(), - TIF_NOTIFY_RESUME)) { - smp_mb__after_clear_bit(); - task_work_run(); - } - } + if (unlikely(current->task_works)) + task_work_run(); spin_lock_irq(¤t->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2198,13 +2193,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) { - if (test_and_clear_ti_thread_flag(current_thread_info(), - TIF_NOTIFY_RESUME)) { - smp_mb__after_clear_bit(); - task_work_run(); - } - } + if (unlikely(current->task_works)) + task_work_run(); if (unlikely(uprobe_deny_signal())) return 0; -- cgit v1.2.2 From 5ed4f1d96deee82ee92cd1ac1e0108c27e80e9b0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 13 Sep 2012 06:11:26 +0200 Subject: sched: Fix nohz_idle_balance() On tickless systems, one CPU runs load balance for all idle CPUs. The cpu_load of this CPU is updated before starting the load balance of each other idle CPUs. We should instead update the cpu_load of the balance_cpu. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Venkatesh Pallipadi Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1347509486-8688-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1ca4fe423528..9ae3a5b68ba4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4794,14 +4794,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) if (need_resched()) break; - raw_spin_lock_irq(&this_rq->lock); - update_rq_clock(this_rq); - update_idle_cpu_load(this_rq); - raw_spin_unlock_irq(&this_rq->lock); + rq = cpu_rq(balance_cpu); + + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); rebalance_domains(balance_cpu, CPU_IDLE); - rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; } -- cgit v1.2.2 From f3e947867478af9a12b9956bcd000ac7613a8a95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Sep 2012 11:22:00 +0200 Subject: sched: Remove __ARCH_WANT_INTERRUPTS_ON_CTXSW Now that the last architecture to use this has stopped doing so (ARM, thanks Catalin!) we can remove this complexity from the scheduler core. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Catalin Marinas Link: http://lkml.kernel.org/n/tip-g9p2a1w81xxbrze25v9zpzbf@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ---- kernel/sched/core.c | 40 +--------------------------------------- kernel/sched/rt.c | 5 ----- kernel/sched/sched.h | 6 ------ 4 files changed, 1 insertion(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..743d48f4d711 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1280,11 +1280,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - p->hardirqs_enabled = 1; -#else p->hardirqs_enabled = 0; -#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c46a011ce5db..8b51b2d9b1fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1361,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) smp_send_reschedule(cpu); } -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -static int ttwu_activate_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; - - rq = __task_rq_lock(p); - if (p->on_cpu) { - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; - } - __task_rq_unlock(rq); - - return ret; - -} -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1440,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ - while (p->on_cpu) { -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - /* - * In case the architecture enables interrupts in - * context_switch(), we cannot busy wait, since that - * would lead to deadlocks when an interrupt hits and - * tries to wake up @prev. So bail and do a complete - * remote wakeup. - */ - if (ttwu_activate_remote(p, wake_flags)) - goto stat; -#else + while (p->on_cpu) cpu_relax(); -#endif - } /* * Pairs with the smp_wmb() in finish_lock_switch(). */ @@ -1798,13 +1766,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; account_switch_vtime(prev); finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - if (unlikely(task_running(rq, next_task))) - return 0; -#endif - retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 09871698e80c..7a7db09cfabc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) */ next->on_cpu = 1; #endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else raw_spin_unlock(&rq->lock); -#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) @@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->on_cpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); -#endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ -- cgit v1.2.2 From 08bedae1d0acd8c9baf514fb69fa199d0c8345f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Sep 2012 00:03:50 +0200 Subject: sched: Fix load avg vs. cpu-hotplug Commit f319da0c68 ("sched: Fix load avg vs cpu-hotplug") was an incomplete fix: In particular, the problem is that at the point it calls calc_load_migrate() nr_running := 1 (the stopper thread), so move the call to CPU_DEAD where we're sure that nr_running := 0. Also note that we can call calc_load_migrate() without serialization, we know the state of rq is stable since its cpu is dead, and we modify the global state using appropriate atomic ops. Suggested-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1346882630.2600.59.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b51b2d9b1fd..ba144b121f3d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5048,7 +5048,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; + case CPU_DEAD: calc_load_migrate(rq); break; #endif -- cgit v1.2.2 From c1cc017c59c44d9ede7003631c43adc0cfdce2f9 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 10 Sep 2012 15:10:58 +0800 Subject: sched/nohz: Clean up select_nohz_load_balancer() There is no load_balancer to be selected now. It just sets the state of the nohz tick to stop. So rename the function, pass the 'cpu' as a parameter and then remove the useless call from tick_nohz_restart_sched_tick(). [ s/set_nohz_tick_stopped/nohz_balance_enter_idle/g s/clear_nohz_tick_stopped/nohz_balance_exit_idle/g ] Signed-off-by: Alex Shi Acked-by: Suresh Siddha Cc: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1347261059-24747-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 ++++++++++--------------- kernel/time/tick-sched.c | 3 +-- 2 files changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ae3a5b68ba4..de596a2f626c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4603,7 +4603,7 @@ static void nohz_balancer_kick(int cpu) return; } -static inline void clear_nohz_tick_stopped(int cpu) +static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); @@ -4643,28 +4643,23 @@ void set_cpu_sd_state_idle(void) } /* - * This routine will record that this cpu is going idle with tick stopped. + * This routine will record that the cpu is going idle with tick stopped. * This info will be used in performing idle load balancing in the future. */ -void select_nohz_load_balancer(int stop_tick) +void nohz_balance_enter_idle(int cpu) { - int cpu = smp_processor_id(); - /* * If this cpu is going down, then nothing needs to be done. */ if (!cpu_active(cpu)) return; - if (stop_tick) { - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + return; - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } - return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, @@ -4672,7 +4667,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DYING: - clear_nohz_tick_stopped(smp_processor_id()); + nohz_balance_exit_idle(smp_processor_id()); return NOTIFY_OK; default: return NOTIFY_DONE; @@ -4833,7 +4828,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); - clear_nohz_tick_stopped(cpu); + nohz_balance_exit_idle(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3a9e5d5c1091..1a5ee90eea33 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - select_nohz_load_balancer(1); + nohz_balance_enter_idle(cpu); calc_load_enter_idle(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -569,7 +569,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ - select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); -- cgit v1.2.2 From bc2a27cd27271c5257989a57f511be86b26f5e54 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 9 Jul 2012 11:27:06 +0200 Subject: sched: cpu_power: enable ARCH_POWER Heteregeneous ARM platform uses arch_scale_freq_power function to reflect the relative capacity of each core Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341826026-6504-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index c38f52ea53dd..eebefcad7027 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -34,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(ARCH_POWER, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -- cgit v1.2.2 From d094595078d00b63839d0c5ccb8b184ef242cb45 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 13 Sep 2012 11:39:51 +0200 Subject: lockdep: Check if nested lock is actually held It is considered good form to lock the lock you claim to be nested in. Signed-off-by: Maarten Lankhorst [ removed nest_lock arg to print_lock_nested_lock_not_held in favour of hlock->nest_lock, also renamed the lock arg to hlock since its a held_lock type ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5051A9E7.5040501@canonical.com Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ea9ee4518c35..7981e5b2350d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +static int +print_lock_nested_lock_not_held(struct task_struct *curr, + struct held_lock *hlock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("==================================\n"); + printk("[ BUG: Nested lock was not taken ]\n"); + print_kernel_ident(); + printk("----------------------------------\n"); + + printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + printk("\nbut this task is not holding:\n"); + printk("%s\n", hlock->nest_lock->name); + + printk("\nstack backtrace:\n"); + dump_stack(); + + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int __lock_is_held(struct lockdep_map *lock); + /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: @@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, id); + if (nest_lock && !__lock_is_held(nest_lock)) + return print_lock_nested_lock_not_held(curr, hlock, ip); + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; -- cgit v1.2.2 From ec145babe754f9ea1079034a108104b6001e001c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 19:26:03 -0400 Subject: time: Fix timeekeping_get_ns overflow on 32bit systems Daniel Lezcano reported seeing multi-second stalls from keyboard input on his T61 laptop when NOHZ and CPU_IDLE were enabled on a 32bit kernel. He bisected the problem down to commit 1e75fa8be9fb6 ("time: Condense timekeeper.xtime into xtime_sec"). After reproducing this issue, I narrowed the problem down to the fact that timekeeping_get_ns() returns a 64bit nsec value that hasn't been accumulated. In some cases this value was being then stored in timespec.tv_nsec (which is a long). On 32bit systems, with idle times larger then 4 seconds (or less, depending on the value of xtime_nsec), the returned nsec value would overflow 32bits. This limited kept time from increasing, causing timers to not expire. The fix is to make sure we don't directly store the result of timekeeping_get_ns() into a tv_nsec field, instead using a 64bit nsec value which can then be added into the timespec via timespec_add_ns(). Reported-and-bisected-by: Daniel Lezcano Tested-by: Daniel Lezcano Signed-off-by: John Stultz Acked-by: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1347405963-35715-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34e5eac81424..d3b91e75cecd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -303,10 +303,11 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(tk); } while (read_seqretry(&tk->lock, seq)); + ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getnstimeofday); @@ -345,6 +346,7 @@ void ktime_get_ts(struct timespec *ts) { struct timekeeper *tk = &timekeeper; struct timespec tomono; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); @@ -352,13 +354,14 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -1244,6 +1247,7 @@ void get_monotonic_boottime(struct timespec *ts) { struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); @@ -1251,14 +1255,15 @@ void get_monotonic_boottime(struct timespec *ts) do { seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); + ts->tv_sec += tomono.tv_sec + sleep.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); -- cgit v1.2.2 From 4fe84fb8c6b5081f7364af63aee8e118a665b966 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 10 Sep 2012 13:01:16 +0100 Subject: locking: Adjust spin lock inlining Kconfig options Break out the DEBUG_SPINLOCK dependency (requires moving up UNINLINE_SPIN_UNLOCK, as this was the only one in that block not depending on that option). Avoid putting values not selected into the resulting .config - they are not useful for anything, make the output less legible, and just consume space: Use "depends on" rather than directly setting the default from the combined dependency values. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/504DF2AC020000780009A2DF@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 103 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 2251882daf53..44511d100eaa 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE bool +config UNINLINE_SPIN_UNLOCK + bool + # # lock_* functions are inlined when: # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y @@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # +if !DEBUG_SPINLOCK + config INLINE_SPIN_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + def_bool y + depends on ARCH_INLINE_SPIN_TRYLOCK config INLINE_SPIN_TRYLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + def_bool y + depends on ARCH_INLINE_SPIN_TRYLOCK_BH config INLINE_SPIN_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK config INLINE_SPIN_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH config INLINE_SPIN_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ config INLINE_SPIN_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_IRQSAVE - -config UNINLINE_SPIN_UNLOCK - bool + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE config INLINE_SPIN_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE config INLINE_READ_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + def_bool y + depends on ARCH_INLINE_READ_TRYLOCK config INLINE_READ_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK config INLINE_READ_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH config INLINE_READ_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ config INLINE_READ_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_IRQSAVE + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + def_bool y + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE config INLINE_WRITE_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + def_bool y + depends on ARCH_INLINE_WRITE_TRYLOCK config INLINE_WRITE_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK config INLINE_WRITE_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH config INLINE_WRITE_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ config INLINE_WRITE_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_IRQSAVE + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + def_bool y + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +endif config MUTEX_SPIN_ON_OWNER - def_bool SMP && !DEBUG_MUTEXES + def_bool y + depends on SMP && !DEBUG_MUTEXES -- cgit v1.2.2 From 76bab1b78ab6f25d5f74165f94526c25fc93d984 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 27 Aug 2012 15:13:45 +0800 Subject: tracing: Skip printing "OK" if failed to disable event No acutal case found. But logically, we should skip "OK" in case any error met. Link: http://lkml.kernel.org/r/1346051625-25231-1-git-send-email-yuanhan.liu@linux.intel.com Signed-off-by: Yuanhan Liu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6825d833a257..bbb0e63d78e9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1646,9 +1646,11 @@ static __init void event_trace_self_tests(void) event_test_stuff(); ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); - if (WARN_ON_ONCE(ret)) + if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); + continue; + } pr_cont("OK\n"); } -- cgit v1.2.2 From ea632e9f12033346cc68247faa3b924d54936b8b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 19:45:14 -0700 Subject: trace: Stop compiling in trace_clock unconditionally Commit 56449f437 "tracing: make the trace clocks available generally", in April 2009, made trace_clock available unconditionally, since CONFIG_X86_DS used it too. Commit faa4602e47 "x86, perf, bts, mm: Delete the never used BTS-ptrace code", in March 2010, removed CONFIG_X86_DS, and now only CONFIG_RING_BUFFER (split out from CONFIG_TRACING for general use) has a dependency on trace_clock. So, only compile in trace_clock with CONFIG_RING_BUFFER or CONFIG_TRACING enabled. Link: http://lkml.kernel.org/r/20120903024513.GA19583@leaf Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrew Morton Cc: "Eric W. Biederman" Cc: Al Viro Signed-off-by: Josh Triplett Signed-off-by: Steven Rostedt --- kernel/Makefile | 2 +- kernel/trace/Kconfig | 5 +++++ kernel/trace/Makefile | 6 +----- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764c..29d993be7dba 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -98,7 +98,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ -obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9301a0e35e0c..4cea4f41c1d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -62,8 +62,12 @@ config HAVE_C_RECORDMCOUNT config TRACER_MAX_TRACE bool +config TRACE_CLOCK + bool + config RING_BUFFER bool + select TRACE_CLOCK config FTRACE_NMI_ENTER bool @@ -114,6 +118,7 @@ config TRACING select NOP_TRACER select BINARY_PRINTF select EVENT_TRACING + select TRACE_CLOCK config GENERIC_TRACER bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 837090808aac..d7e2068e4b71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,11 +19,7 @@ endif CFLAGS_trace_events_filter.o := -I$(src) -# -# Make the trace clocks available generally: it's infrastructure -# relied on by ptrace for example: -# -obj-y += trace_clock.o +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o -- cgit v1.2.2 From c6aaf4d0bb86e2154ea31a33804cec300611255f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Sep 2012 23:31:25 +0900 Subject: kprobes/x86: Fix to support jprobes on ftrace-based kprobe Fix kprobes/x86 to support jprobes on ftrace-based kprobes. Because of -mfentry support of ftrace, ftrace is now put on the beginning of function where jprobes are put. Originally ftrace-based kprobes doesn't support jprobe because it will change regs->ip and ftrace doesn't support changing IP and ftrace itself doesn't conflict jprobe. However, ftrace -mfentry support moves mcount call on the top of functions where jprobes are put. This means that jprobe always conflicts with ftrace-based kprobe and fails. This patch allows ftrace-based kprobes to support jprobes by allowing to modify regs->ip and kprobes breakpoint handler also allows to skip singlestepping because there is a ftrace call (not an original instruction). Link: http://lkml.kernel.org/r/20120905143125.10329.90836.stgit@localhost.localdomain Reported-by: Fengguang Wu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 35b4315d84f5..098f396aa409 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1418,9 +1418,6 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; - /* break_handler (jprobe) can not work with ftrace */ - if (p->break_handler) - return -EINVAL; p->flags |= KPROBE_FLAG_FTRACE; #else /* !KPROBES_CAN_USE_FTRACE */ return -EINVAL; -- cgit v1.2.2 From be45c900fdc2c66baad5a7703fb8136991d88aeb Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 13 Sep 2012 09:50:55 +0200 Subject: cgroup: Remove CGROUP_BUILTIN_SUBSYS_COUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CGROUP_BUILTIN_SUBSYS_COUNT is used as start index or stop index when looping over the subsys array looking either at the builtin or the module subsystems. Since all the builtin subsystems have an id which is lower then CGROUP_BUILTIN_SUBSYS_COUNT we know that any module will have an id larger than CGROUP_BUILTIN_SUBSYS_COUNT. In short the ids are sorted. We are about to change id assignment to happen only at compile time later in this series. That means we can't rely on the above trick since all ids will always be defined at compile time. Furthermore, ordering the builtin subsystems and the module subsystems is not really necessary. So we need a different way to know which subsystem is a builtin or a module one. We can use the subsys[]->module pointer for this. Any place where we need to know if a subsys is module we just check for the pointer. If it is NULL then the subsystem is a builtin one. With this we are able to drop the CGROUP_BUILTIN_SUBSYS_COUNT enum. Though we need to introduce a temporary placeholder so that we don't get a compilation error when only CONFIG_CGROUP is selected and no single controller. An empty enum definition is not valid. Later in this series we are able to remove the placeholder again. And with this change we get a fix for this: kernel/cgroup.c: In function ‘cgroup_load_subsys’: kernel/cgroup.c:4326:38: warning: array subscript is below array bounds [-Warray-bounds] when CONFIG_CGROUP=y and no built in controller was enabled. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: Gao feng Cc: Jamal Hadi Salim Cc: John Fastabend Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- kernel/cgroup.c | 68 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ced292d720b9..1b18090269ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -88,7 +88,7 @@ static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by * cgroup_mutex. */ @@ -1321,7 +1321,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * take duplicate reference counts on a subsystem that's already used, * but rebind_subsystems handles this case. */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; if (!(bit & opts->subsys_mask)) @@ -1337,7 +1337,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * raced with a module_delete call, and to the user this is * essentially a "subsystem doesn't exist" case. */ - for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + for (i--; i >= 0; i--) { /* drop refcounts only on the ones we took */ unsigned long bit = 1UL << i; @@ -1354,7 +1354,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) static void drop_parsed_module_refcounts(unsigned long subsys_mask) { int i; - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; if (!(bit & subsys_mask)) @@ -4442,8 +4442,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * since cgroup_init_subsys will have already taken care of it. */ if (ss->module == NULL) { - /* a few sanity checks */ - BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + /* a sanity check */ BUG_ON(subsys[ss->subsys_id] != ss); return 0; } @@ -4457,7 +4456,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) */ mutex_lock(&cgroup_mutex); /* find the first empty slot in the array */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { if (subsys[i] == NULL) break; } @@ -4560,7 +4559,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); /* deassign the subsys_id */ - BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ @@ -4623,10 +4621,13 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; + BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->create); @@ -4659,9 +4660,12 @@ int __init cgroup_init(void) if (err) return err; - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) @@ -4856,13 +4860,16 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - /* - * forkexit callbacks are only supported for builtin - * subsystems, and the builtin section of the subsys array is - * immutable, so we don't need to lock the subsys array here. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* + * forkexit callbacks are only supported for + * builtin subsystems. + */ + if (!ss || ss->module) + continue; + if (ss->fork) ss->fork(child); } @@ -4967,12 +4974,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* modular subsystems can't use callbacks */ + if (!ss || ss->module) + continue; + if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -5158,13 +5166,17 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about module - * subsystems, so we don't worry about them. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* + * cgroup_disable, being at boot time, can't + * know about module subsystems, so we don't + * worry about them. + */ + if (!ss || ss->module) + continue; + if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" -- cgit v1.2.2 From 5fc0b02544b3b9bd3db5a8156b5f3e7350f8e797 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:05 +0200 Subject: cgroup: Wrap subsystem selection macro Before we are able to define all subsystem ids at compile time we need a more fine grained control what gets defined when we include cgroup_subsys.h. For example we define the enums for the subsystems or to declare for struct cgroup_subsys (builtin subsystem) by including cgroup_subsys.h and defining SUBSYS accordingly. Currently, the decision if a subsys is used is defined inside the header by testing if CONFIG_*=y is true. By moving this test outside of cgroup_subsys.h we are able to control it on the include level. This is done by introducing IS_SUBSYS_ENABLED which then is defined according the task, e.g. is CONFIG_*=y or CONFIG_*=m. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: Gao feng Cc: Jamal Hadi Salim Cc: John Fastabend Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1b18090269ad..95e9f3fdb729 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -93,6 +93,7 @@ static DEFINE_MUTEX(cgroup_root_mutex); * cgroup_mutex. */ #define SUBSYS(_x) &_x ## _subsys, +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include }; -- cgit v1.2.2 From 80f4c87774721e864d5a5a1f7aca3e95fd90e194 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:06 +0200 Subject: cgroup: Do not depend on a given order when populating the subsys array The *_subsys_id will be used as index to access the subsys. Therefore we need to care we populate the subsystem at the correct position by using designated initialization. With this change we are able to interleave builtin and modules in the subsys array. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: Gao feng Cc: Jamal Hadi Salim Cc: John Fastabend Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95e9f3fdb729..2bfc78f531b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -92,7 +92,7 @@ static DEFINE_MUTEX(cgroup_root_mutex); * registered after that. The mutable section of this array is protected by * cgroup_mutex. */ -#define SUBSYS(_x) &_x ## _subsys, +#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include -- cgit v1.2.2 From 8a8e04df4747661daaee77e98e102d99c9e09b98 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:07 +0200 Subject: cgroup: Assign subsystem IDs during compile time WARNING: With this change it is impossible to load external built controllers anymore. In case where CONFIG_NETPRIO_CGROUP=m and CONFIG_NET_CLS_CGROUP=m is set, corresponding subsys_id should also be a constant. Up to now, net_prio_subsys_id and net_cls_subsys_id would be of the type int and the value would be assigned during runtime. By switching the macro definition IS_SUBSYS_ENABLED from IS_BUILTIN to IS_ENABLED, all *_subsys_id will have constant value. That means we need to remove all the code which assumes a value can be assigned to net_prio_subsys_id and net_cls_subsys_id. A close look is necessary on the RCU part which was introduces by following patch: commit f845172531fb7410c7fb7780b1a6e51ee6df7d52 Author: Herbert Xu Mon May 24 09:12:34 2010 Committer: David S. Miller Mon May 24 09:12:34 2010 cls_cgroup: Store classid in struct sock Tis code was added to init_cgroup_cls() /* We can't use rcu_assign_pointer because this is an int. */ smp_wmb(); net_cls_subsys_id = net_cls_subsys.subsys_id; respectively to exit_cgroup_cls() net_cls_subsys_id = -1; synchronize_rcu(); and in module version of task_cls_classid() rcu_read_lock(); id = rcu_dereference(net_cls_subsys_id); if (id >= 0) classid = container_of(task_subsys_state(p, id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); Without an explicit explaination why the RCU part is needed. (The rcu_deference was fixed by exchanging it to rcu_derefence_index_check() in a later commit, but that is a minor detail.) So here is my pondering why it was introduced and why it safe to remove it now. Note that this code was copied over to net_prio the reasoning holds for that subsystem too. The idea behind the RCU use for net_cls_subsys_id is to make sure we get a valid pointer back from task_subsys_state(). task_subsys_state() is just blindly accessing the subsys array and returning the pointer. Obviously, passing in -1 as id into task_subsys_state() returns an invalid value (out of lower bound). So this code makes sure that only after module is loaded and the subsystem registered, the id is assigned. Before unregistering the module all old readers must have left the critical section. This is done by assigning -1 to the id and issuing a synchronized_rcu(). Any new readers wont call task_subsys_state() anymore and therefore it is safe to unregister the subsystem. The new code relies on the same trick, but it looks at the subsys pointer return by task_subsys_state() (remember the id is constant and therefore we allways have a valid index into the subsys array). No precautions need to be taken during module loading module. Eventually, all CPUs will get a valid pointer back from task_subsys_state() because rebind_subsystem() which is called after the module init() function will assigned subsys[net_cls_subsys_id] the newly loaded module subsystem pointer. When the subsystem is about to be removed, rebind_subsystem() will called before the module exit() function. In this case, rebind_subsys() will assign subsys[net_cls_subsys_id] a NULL pointer and then it calls synchronize_rcu(). All old readers have left by then the critical section. Any new reader wont access the subsystem anymore. At this point we are safe to unregister the subsystem. No synchronize_rcu() call is needed. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: "David S. Miller" Cc: "Paul E. McKenney" Cc: Andrew Morton Cc: Eric Dumazet Cc: Gao feng Cc: Glauber Costa Cc: Herbert Xu Cc: Jamal Hadi Salim Cc: John Fastabend Cc: Kamezawa Hiroyuki Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- kernel/cgroup.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2bfc78f531b6..485cc1487ea2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4451,24 +4451,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* init base cftset */ cgroup_init_cftsets(ss); - /* - * need to register a subsys id before anything else - for example, - * init_cgroup_css needs it. - */ mutex_lock(&cgroup_mutex); - /* find the first empty slot in the array */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (subsys[i] == NULL) - break; - } - if (i == CGROUP_SUBSYS_COUNT) { - /* maximum number of subsystems already registered! */ - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - /* assign ourselves the subsys_id */ - ss->subsys_id = i; - subsys[i] = ss; + subsys[ss->subsys_id] = ss; /* * no ss->create seems to need anything important in the ss struct, so @@ -4477,7 +4461,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } @@ -4493,7 +4477,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (ret) { dummytop->subsys[ss->subsys_id] = NULL; ss->destroy(dummytop); - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.2.2 From 8c7f6edbda01f1b1a2e60ad61f14fe38023e433b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Sep 2012 12:20:58 -0700 Subject: cgroup: mark subsystems with broken hierarchy support and whine if cgroups are nested for them Currently, cgroup hierarchy support is a mess. cpu related subsystems behave correctly - configuration, accounting and control on a parent properly cover its children. blkio and freezer completely ignore hierarchy and treat all cgroups as if they're directly under the root cgroup. Others show yet different behaviors. These differing interpretations of cgroup hierarchy make using cgroup confusing and it impossible to co-mount controllers into the same hierarchy and obtain sane behavior. Eventually, we want full hierarchy support from all subsystems and probably a unified hierarchy. Users using separate hierarchies expecting completely different behaviors depending on the mounted subsystem is deterimental to making any progress on this front. This patch adds cgroup_subsys.broken_hierarchy and sets it to %true for controllers which are lacking in hierarchy support. The goal of this patch is two-fold. * Move users away from using hierarchy on currently non-hierarchical subsystems, so that implementing proper hierarchy support on those doesn't surprise them. * Keep track of which controllers are broken how and nudge the subsystems to implement proper hierarchy support. For now, start with a single warning message. We can whine louder later on. v2: Fixed a typo spotted by Michal. Warning message updated. v3: Updated memcg part so that it doesn't generate warning in the cases where .use_hierarchy=false doesn't make the behavior different from root.use_hierarchy=true. Fixed a typo spotted by Glauber. v4: Check ->broken_hierarchy after cgroup creation is complete so that ->create() can affect the result per Michal. Dropped unnecessary memcg root handling per Michal. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Acked-by: Serge E. Hallyn Cc: Glauber Costa Cc: Peter Zijlstra Cc: Paul Turner Cc: Johannes Weiner Cc: Thomas Graf Cc: Vivek Goyal Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Neil Horman Cc: Aneesh Kumar K.V --- kernel/cgroup.c | 12 +++++++++++- kernel/cgroup_freezer.c | 8 ++++++++ kernel/events/core.c | 7 +++++++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..b7d9606b17d7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3954,8 +3954,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(cgrp); + struct cgroup_subsys_state *css; + css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; @@ -3969,6 +3970,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) ss->post_clone(cgrp); + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } } list_add(&cgrp->sibling, &cgrp->parent->children); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3649fc6b3eaa..b1724ce98981 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = { .can_attach = freezer_can_attach, .fork = freezer_fork, .base_cftypes = files, + + /* + * freezer subsys doesn't handle hierarchy at all. Frozen state + * should be inherited through the hierarchy - if a parent is + * frozen, all its children should be frozen. Fix it and remove + * the following. + */ + .broken_hierarchy = true, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d9..f18a0a56e5aa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7285,5 +7285,12 @@ struct cgroup_subsys perf_subsys = { .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, + + /* + * perf_event cgroup doesn't handle nesting correctly. + * ctx->nr_cgroups adjustments should be propagated through the + * cgroup hierarchy. Fix it and remove the following. + */ + .broken_hierarchy = true, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.2 From 6d1d8dfa8b65831cfa9a528e3d17efa7e7f4226c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 30 Aug 2012 19:26:22 +0200 Subject: uprobes: Don't put NULL pointer in uprobe_register() alloc_uprobe() might return a NULL pointer, put_uprobe() can't deal with this. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1666632e6edf..336f06948de1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -897,7 +897,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } mutex_unlock(uprobes_hash(inode)); - put_uprobe(uprobe); + if (uprobe) + put_uprobe(uprobe); return ret; } -- cgit v1.2.2 From 6f47caa0e1e4887aa2ddca8388d058d35725d815 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 18 Aug 2012 17:01:57 +0200 Subject: uprobes: uprobes_treelock should not disable irqs Nobody plays with uprobes_tree/uprobes_treelock in interrupt context, no need to disable irqs. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 336f06948de1..ba9f1e7c6060 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -411,11 +411,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - unsigned long flags; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); return uprobe; } @@ -462,12 +461,11 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { - unsigned long flags; struct uprobe *u; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); /* For now assume that the instruction need not be single-stepped */ uprobe->flags |= UPROBE_SKIP_SSTEP; @@ -705,11 +703,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { - unsigned long flags; - - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); atomic_dec(&uprobe_events); @@ -968,7 +964,6 @@ static void build_probe_list(struct inode *inode, struct list_head *head) { loff_t min, max; - unsigned long flags; struct rb_node *n, *t; struct uprobe *u; @@ -976,7 +971,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -994,7 +989,7 @@ static void build_probe_list(struct inode *inode, atomic_inc(&u->ref); } } - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); } /* -- cgit v1.2.2 From 9f68f672c47b9bd4cfe0a667ecb0b1382c61e2de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Aug 2012 16:15:09 +0200 Subject: uprobes: Introduce MMF_RECALC_UPROBES Add the new MMF_RECALC_UPROBES flag, it means that MMF_HAS_UPROBES can be false positive after remove_breakpoint() or uprobe_munmap(). It is also set by uprobe_dup_mmap(), this is not optimal but simple. We could add the new hook, uprobe_dup_vma(), to set MMF_HAS_UPROBES only if the new mm actually has uprobes, but I don't think this makes sense. The next patch will use this flag to clear MMF_HAS_UPROBES. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ba9f1e7c6060..9a7f08bab91f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -684,7 +684,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, set_bit(MMF_HAS_UPROBES, &mm->flags); ret = set_swbp(&uprobe->arch, mm, vaddr); - if (ret && first_uprobe) + if (!ret) + clear_bit(MMF_RECALC_UPROBES, &mm->flags); + else if (first_uprobe) clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; @@ -693,6 +695,11 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { + /* can happen if uprobe_register() fails */ + if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) + return; + + set_bit(MMF_RECALC_UPROBES, &mm->flags); set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -1026,6 +1033,25 @@ int uprobe_mmap(struct vm_area_struct *vma) return 0; } +static bool +vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + loff_t min, max; + struct inode *inode; + struct rb_node *n; + + inode = vma->vm_file->f_mapping->host; + + min = vaddr_to_offset(vma, start); + max = min + (end - start) - 1; + + spin_lock(&uprobes_treelock); + n = find_node_in_range(inode, min, max); + spin_unlock(&uprobes_treelock); + + return !!n; +} + /* * Called in context of a munmap of a vma. */ @@ -1037,10 +1063,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || + test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; - /* TODO: unmapping uprobe(s) will need more work */ + if (vma_has_uprobes(vma, start, end)) + set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } /* Slot allocation for XOL */ @@ -1146,8 +1174,11 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); + /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ + set_bit(MMF_RECALC_UPROBES, &newmm->flags); + } } /* -- cgit v1.2.2 From 499a4f3ec057a0f79636cc3c1e581bb6e977a30f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Aug 2012 17:41:34 +0200 Subject: uprobes: Teach find_active_uprobe() to clear MMF_HAS_UPROBES The wrong MMF_HAS_UPROBES doesn't really hurt, just it triggers the "slow" and unnecessary handle_swbp() path if the task hits the non-uprobe breakpoint. So this patch changes find_active_uprobe() to check every valid vma and clear MMF_HAS_UPROBES if no uprobes were found. This is adds the slow O(n) path, but it is only called in unlikely case when the task hits the normal breakpoint first time after uprobe_unregister(). Note the "not strictly accurate" comment in mmf_recalc_uprobes(). We can fix this, we only need to teach vma_has_uprobes() to return a bit more more info, but I am not sure this worth the trouble. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9a7f08bab91f..e4a906ce2e1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1396,6 +1396,25 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } +static void mmf_recalc_uprobes(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!valid_vma(vma, false)) + continue; + /* + * This is not strictly accurate, we can race with + * uprobe_unregister() and see the already removed + * uprobe if delete_uprobe() was not yet called. + */ + if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) + return; + } + + clear_bit(MMF_HAS_UPROBES, &mm->flags); +} + static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; @@ -1417,6 +1436,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } + + if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + mmf_recalc_uprobes(mm); up_read(&mm->mmap_sem); return uprobe; -- cgit v1.2.2 From 9d778782266f95e5c6ec43ed8195ba331c821018 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Aug 2012 18:12:28 +0200 Subject: uprobes: Introduce arch_uprobe_enable/disable_step() As Oleg pointed out in [0] uprobe should not use the ptrace interface for enabling/disabling single stepping. [0] http://lkml.kernel.org/r/20120730141638.GA5306@redhat.com Add the new "__weak arch" helpers which simply call user_*_single_step() as a preparation. This is only needed to not break the powerpc port, we will fold this logic into arch_uprobe_pre/post_xol() hooks later. We should also change handle_singlestep(), _disable_step(&uprobe->arch) should be called before put_uprobe(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4a906ce2e1d..912ef48d28ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1444,6 +1444,16 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) +{ + user_enable_single_step(current); +} + +void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) +{ + user_disable_single_step(current); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1490,7 +1500,7 @@ static void handle_swbp(struct pt_regs *regs) utask->state = UTASK_SSTEP; if (!pre_ssout(uprobe, regs, bp_vaddr)) { - user_enable_single_step(current); + arch_uprobe_enable_step(&uprobe->arch); return; } @@ -1526,10 +1536,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) else WARN_ON_ONCE(1); + arch_uprobe_disable_step(&uprobe->arch); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - user_disable_single_step(current); xol_free_insn_slot(current); spin_lock_irq(¤t->sighand->siglock); -- cgit v1.2.2 From 37407ea7f93864c2cfc03edf8f37872ec539ea2b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Sep 2012 12:29:43 -0700 Subject: Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturbations" This reverts commit 970e178985cadbca660feb02f4d2ee3a09f7fdda. Nikolay Ulyanitsky reported thatthe 3.6-rc5 kernel has a 15-20% performance drop on PostgreSQL 9.2 on his machine (running "pgbench"). Borislav Petkov was able to reproduce this, and bisected it to this commit 970e178985ca ("sched: Improve scalability via 'CPU buddies' ...") apparently because the new single-idle-buddy model simply doesn't find idle CPU's to reschedule on aggressively enough. Mike Galbraith suspects that it is likely due to the user-mode spinlocks in PostgreSQL not reacting well to preemption, but we don't really know the details - I'll just revert the commit for now. There are hopefully other approaches to improve scheduler scalability without it causing these kinds of downsides. Reported-by: Nikolay Ulyanitsky Bisected-by: Borislav Petkov Acked-by: Mike Galbraith Cc: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 39 +-------------------------------------- kernel/sched/fair.c | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a4ea245f3d85..649c9f876cb1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6014,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * - * Iterate domains and sched_groups downward, assigning CPUs to be - * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing - * due to random perturbation self canceling, ie sw buddies pull - * their counterpart to their CPU's hw counterpart. - * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6032,40 +6027,8 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - struct sched_domain *tmp = sd; - struct sched_group *sg, *prev; - bool right; - - /* - * Traverse to first CPU in group, and count hops - * to cpu from there, switching direction on each - * hop, never ever pointing the last CPU rightward. - */ - do { - id = cpumask_first(sched_domain_span(tmp)); - prev = sg = tmp->groups; - right = 1; - - while (cpumask_first(sched_group_cpus(sg)) != id) - sg = sg->next; - - while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { - prev = sg; - sg = sg->next; - right = !right; - } - - /* A CPU went down, never point back to domain start. */ - if (right && cpumask_first(sched_group_cpus(sg->next)) == id) - right = false; - - sg = right ? sg->next : prev; - tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); - } while ((tmp = tmp->child)); - + if (sd) id = cpumask_first(sched_domain_span(sd)); - } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42d9df6a5ca4..96e2b18b6283 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; + struct sched_group *sg; + int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, check assigned siblings to find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) - continue; - if (idle_cpu(sd->idle_buddy)) - return sd->idle_buddy; - } + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); + } +done: return target; } -- cgit v1.2.2 From 5c4233697c3f5cb14eb7a969332e2d60f357f952 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 14 Aug 2012 17:08:45 +0100 Subject: arm64: Add support for /proc/sys/debug/exception-trace This patch allows setting of the show_unhandled_signals variable via /proc/sys/debug/exception-trace. The default value is currently 1 showing unhandled user faults (undefined instructions, data aborts) and invalid signal stack frames. Signed-off-by: Catalin Marinas Acked-by: Tony Lindgren Acked-by: Arnd Bergmann Acked-by: Nicolas Pitre Acked-by: Olof Johansson Acked-by: Santosh Shilimkar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..79dcb0063182 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1544,7 +1544,7 @@ static struct ctl_table fs_table[] = { static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) + defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.2 From 579035dc5ddd6d48fd8529e7358b03d911ab9d8a Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 17 Sep 2012 14:09:12 -0700 Subject: pid-namespace: limit value of ns_last_pid to (0, max_pid) The kernel doesn't check the pid for negative values, so if you try to write -2 to /proc/sys/kernel/ns_last_pid, you will get a kernel panic. The crash happens because the next pid is -1, and alloc_pidmap() will try to access to a nonexistent pidmap. map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; Signed-off-by: Andrew Vagin Acked-by: Cyrill Gorcunov Acked-by: Oleg Nesterov Cc: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b3c7fd554250..6144bab8fd8e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, */ tmp.data = ¤t->nsproxy->pid_ns->last_pid; - return proc_dointvec(&tmp, write, buffer, lenp, ppos); + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } +extern int pid_max; +static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, + .extra1 = &zero, + .extra2 = &pid_max, }, { } }; -- cgit v1.2.2 From 960bd11bf2daf669d0d910428fd9ef5a15c3d7cb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 17 Sep 2012 15:42:31 -0700 Subject: workqueue: always clear WORKER_REBIND in busy_worker_rebind_fn() busy_worker_rebind_fn() didn't clear WORKER_REBIND if rebinding failed (CPU is down again). This used to be okay because the flag wasn't used for anything else. However, after 25511a477 "workqueue: reimplement CPU online rebinding to handle idle workers", WORKER_REBIND is also used to command idle workers to rebind. If not cleared, the worker may confuse the next CPU_UP cycle by having REBIND spuriously set or oops / get stuck by prematurely calling idle_worker_rebind(). WARNING: at /work/os/wq/kernel/workqueue.c:1323 worker_thread+0x4cd/0x5 00() Hardware name: Bochs Modules linked in: test_wq(O-) Pid: 33, comm: kworker/1:1 Tainted: G O 3.6.0-rc1-work+ #3 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] worker_thread+0x4cd/0x500 [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 ---[ end trace e977cf20f4661968 ]--- BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] worker_thread+0x360/0x500 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: test_wq(O-) CPU 0 Pid: 33, comm: kworker/1:1 Tainted: G W O 3.6.0-rc1-work+ #3 Bochs Bochs RIP: 0010:[] [] worker_thread+0x360/0x500 RSP: 0018:ffff88001e1c9de0 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffff88001e633e00 RCX: 0000000000004140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000009 RBP: ffff88001e1c9ea0 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000002 R11: 0000000000000000 R12: ffff88001fc8d580 R13: ffff88001fc8d590 R14: ffff88001e633e20 R15: ffff88001e1c6900 FS: 0000000000000000(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000000130e8000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/1:1 (pid: 33, threadinfo ffff88001e1c8000, task ffff88001e1c6900) Stack: ffff880000000000 ffff88001e1c9e40 0000000000000001 ffff88001e1c8010 ffff88001e519c78 ffff88001e1c9e58 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001fc8d340 ffff88001fc8d340 Call Trace: [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 Code: b1 00 f6 43 48 02 0f 85 91 01 00 00 48 8b 43 38 48 89 df 48 8b 00 48 89 45 90 e8 ac f0 ff ff 3c 01 0f 85 60 01 00 00 48 8b 53 50 <8b> 02 83 e8 01 85 c0 89 02 0f 84 3b 01 00 00 48 8b 43 38 48 8b RIP [] worker_thread+0x360/0x500 RSP CR2: 0000000000000000 There was no reason to keep WORKER_REBIND on failure in the first place - WORKER_UNBOUND is guaranteed to be set in such cases preventing incorrectly activating concurrency management. Always clear WORKER_REBIND. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e1373bcb3e3..b80065a2450a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1349,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); + worker_maybe_bind_and_lock(worker); + + /* + * %WORKER_REBIND must be cleared even if the above binding failed; + * otherwise, we may confuse the next CPU_UP cycle or oops / get + * stuck by calling idle_worker_rebind() prematurely. If CPU went + * down again inbetween, %WORKER_UNBOUND would be set, so clearing + * %WORKER_REBIND is always safe. + */ + worker_clr_flags(worker, WORKER_REBIND); spin_unlock_irq(&gcwq->lock); } -- cgit v1.2.2 From 34e36d8ecbd958bc15f8e63deade1227de337eb1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 23:20:20 -0700 Subject: audit: Limit audit requests to processes in the initial pid and user namespaces. This allows the code to safely make the assumption that all of the uids gids and pids that need to be send in audit messages are in the initial namespaces. If someone cares we may lift this restriction someday but start with limiting access so at least the code is always correct. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ea3b7b6191c7..7b7268e3073b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -61,6 +61,7 @@ #include #include #include +#include #include "audit.h" @@ -588,6 +589,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; + /* Only support the initial namespaces for now. */ + if ((current_user_ns() != &init_user_ns) || + (task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + switch (msg_type) { case AUDIT_GET: case AUDIT_LIST: -- cgit v1.2.2 From 02276bda4a2bf094fcde89fb5db4d9e86347ebf4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 23:10:16 -0700 Subject: audit: Use current instead of NETLINK_CREDS() in audit_filter Get caller process uid and gid and pid values from the current task instead of the NETLINK_CB. This is simpler than passing NETLINK_CREDS from from audit_receive_msg to audit_filter_user_rules and avoid the chance of being hit by the occassional bugs in netlink uid/gid credential passing. This is a safe changes because all netlink requests are processed in the task of the sending process. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 2 +- kernel/auditfilter.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7b7268e3073b..fecb1507b485 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -744,7 +744,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb)); + err = audit_filter_user(); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6c3f1abd206..b754f43bc56c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1236,8 +1236,7 @@ int audit_compare_dname_path(const char *dname, const char *path, return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule, enum audit_state *state) { int i; @@ -1249,13 +1248,13 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, switch (f->type) { case AUDIT_PID: - result = audit_comparator(cb->creds.pid, f->op, f->val); + result = audit_comparator(task_pid_vnr(current), f->op, f->val); break; case AUDIT_UID: - result = audit_comparator(cb->creds.uid, f->op, f->val); + result = audit_comparator(current_uid(), f->op, f->val); break; case AUDIT_GID: - result = audit_comparator(cb->creds.gid, f->op, f->val); + result = audit_comparator(current_gid(), f->op, f->val); break; case AUDIT_LOGINUID: result = audit_comparator(audit_get_loginuid(current), @@ -1287,7 +1286,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb) +int audit_filter_user(void) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1295,7 +1294,7 @@ int audit_filter_user(struct netlink_skb_parms *cb) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (audit_filter_user_rules(&e->rule, &state)) { if (state == AUDIT_DISABLED) ret = 0; break; -- cgit v1.2.2 From f95732e2e0a649c148be0242b72e3c7473092687 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 23:31:17 -0700 Subject: audit: kill audit_prepare_user_tty Now that netlink messages are processed in the context of the sender tty_audit_push_task can be called directly and audit_prepare_user_tty which only added looking up the task of the tty by process id is not needed. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index fecb1507b485..58f704b432e4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -468,24 +468,6 @@ static int kauditd_thread(void *dummy) return 0; } -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) -{ - struct task_struct *tsk; - int err; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(tsk); - rcu_read_unlock(); - err = tty_audit_push_task(tsk, loginuid, sessionid); - put_task_struct(tsk); - return err; -} - int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; @@ -748,7 +730,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid, + err = tty_audit_push_task(current, loginuid, sessionid); if (err) break; -- cgit v1.2.2 From 8aa14b64981ee4b95959e1ed331b672d053aab62 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 23:43:14 -0700 Subject: audit: Simply AUDIT_TTY_SET and AUDIT_TTY_GET Use current instead of looking up the current up the current task by process identifier. Netlink requests are processed in trhe context of the sending task so this is safe. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 58f704b432e4..2a8728fdefc4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -866,41 +866,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk; - unsigned long flags; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - s.enabled = tsk->signal->audit_tty != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); - - if (!err) - audit_send_reply(NETLINK_CB(skb).pid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + struct task_struct *tsk = current; + + spin_lock_irq(&tsk->sighand->siglock); + s.enabled = tsk->signal->audit_tty != 0; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_send_reply(NETLINK_CB(skb).pid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; - struct task_struct *tsk; - unsigned long flags; + struct task_struct *tsk = current; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - tsk->signal->audit_tty = s->enabled != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); + + spin_lock_irq(&tsk->sighand->siglock); + tsk->signal->audit_tty = s->enabled != 0; + spin_unlock_irq(&tsk->sighand->siglock); break; } default: -- cgit v1.2.2 From 35ce9888ad2a60c95849551e7345bd547714bbff Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 11 Sep 2012 00:12:29 -0700 Subject: audit: Properly set the origin port id of audit messages. For user generated audit messages set the portid field in the netlink header to the netlink port where the user generated audit message came from. Reporting the process id in a port id field was just nonsense. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2a8728fdefc4..9dd4d0936969 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -751,7 +751,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_pid(ab, pid); + audit_set_pid(ab, NETLINK_CB(skb).pid); audit_log_end(ab); } break; -- cgit v1.2.2 From 017143fecb3364e5fed8107d206799899f5dd684 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 11 Sep 2012 00:19:06 -0700 Subject: audit: Remove the unused uid parameter from audit_receive_filter Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 4 ++-- kernel/auditfilter.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 9dd4d0936969..a31e31bba2d3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -771,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* fallthrough */ case AUDIT_LIST: err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), + seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; case AUDIT_ADD_RULE: @@ -790,7 +790,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* fallthrough */ case AUDIT_LIST_RULES: err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), + seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; case AUDIT_TRIM: diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b754f43bc56c..e242dd9aa2d0 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1098,7 +1098,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, +int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; -- cgit v1.2.2 From 860c0aaff75e714c21d325f32d36a37572b4fffb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 11 Sep 2012 00:24:49 -0700 Subject: audit: Don't pass pid or uid to audit_log_common_recv_msg The only place we use the uid and the pid that we calculate in audit_receive_msg is in audit_log_common_recv_msg so move the calculation of these values into the audit_log_common_recv_msg. Simplify the calcuation of the current pid and uid by reading them from current instead of reading them from NETLINK_CREDS. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a31e31bba2d3..2e0dd5edf69b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -607,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 ses, - u32 sid) + uid_t auid, u32 ses, u32 sid) { int rc = 0; char *ctx = NULL; @@ -621,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - pid, uid, auid, ses); + task_tgid_vnr(current), + from_kuid(&init_user_ns, current_uid()), + auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -637,7 +638,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 uid, pid, seq, sid; + u32 seq, sid; void *data; struct audit_status *status_get, status_set; int err; @@ -663,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } - pid = NETLINK_CREDS(skb)->pid; - uid = NETLINK_CREDS(skb)->uid; loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); @@ -735,7 +734,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - audit_log_common_recv_msg(&ab, msg_type, pid, uid, + audit_log_common_recv_msg(&ab, msg_type, loginuid, sessionid, sid); if (msg_type != AUDIT_USER_TTY) @@ -760,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -779,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -796,8 +795,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); @@ -828,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); -- cgit v1.2.2 From ca57ec0f00c3f139c41bf6b0a5b9bcc95bbb2ad7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 11 Sep 2012 02:18:08 -0700 Subject: audit: Add typespecific uid and gid comparators The audit filter code guarantees that uid are always compared with uids and gids are always compared with gids, as the comparason operations are type specific. Take advantage of this proper to define audit_uid_comparator and audit_gid_comparator which use the type safe comparasons from uidgid.h. Build on audit_uid_comparator and audit_gid_comparator and replace audit_compare_id with audit_compare_uid and audit_compare_gid. This is one of those odd cases where being type safe and duplicating code leads to simpler shorter and more concise code. Don't allow bitmask operations in uid and gid comparisons in audit_data_to_entry. Bitmask operations are already denined in audit_rule_to_entry. Convert constants in audit_rule_to_entry and audit_data_to_entry into kuids and kgids when appropriate. Convert the uid and gid field in struct audit_names to be of type kuid_t and kgid_t respectively, so that the new uid and gid comparators can be applied in a type safe manner. Cc: Al Viro Cc: Eric Paris Signed-off-by: "Eric W. Biederman" --- kernel/audit.h | 2 + kernel/auditfilter.c | 119 ++++++++++++++++++++++++++++++++++++---- kernel/auditsc.c | 150 ++++++++++++++++++++++++--------------------------- 3 files changed, 182 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 816766803371..4b428bb41ea3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -76,6 +76,8 @@ static inline int audit_hash_ino(u32 ino) extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); +extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e242dd9aa2d0..b30320cea26f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; + f->uid = INVALID_UID; + f->gid = INVALID_GID; err = -EINVAL; if (f->op == Audit_bad) @@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) switch(f->type) { default: goto exit_free; - case AUDIT_PID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: + case AUDIT_LOGINUID: + /* bit ops not implemented for uid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->uid = make_kuid(current_user_ns(), f->val); + if (!uid_valid(f->uid)) + goto exit_free; + break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - case AUDIT_LOGINUID: + /* bit ops not implemented for gid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->gid = make_kgid(current_user_ns(), f->val); + if (!gid_valid(f->gid)) + goto exit_free; + break; + case AUDIT_PID: case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: @@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->type = data->fields[i]; f->val = data->values[i]; + f->uid = INVALID_UID; + f->gid = INVALID_GID; f->lsm_str = NULL; f->lsm_rule = NULL; switch(f->type) { - case AUDIT_PID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + /* bit ops not implemented for uid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->uid = make_kuid(current_user_ns(), f->val); + if (!uid_valid(f->uid)) + goto exit_free; + break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - case AUDIT_LOGINUID: + case AUDIT_OBJ_GID: + /* bit ops not implemented for gid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->gid = make_kgid(current_user_ns(), f->val); + if (!gid_valid(f->gid)) + goto exit_free; + break; + case AUDIT_PID: case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: @@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: - case AUDIT_OBJ_UID: - case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) + return 1; + break; + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_OBJ_GID: + if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -1198,6 +1251,52 @@ int audit_comparator(u32 left, u32 op, u32 right) } } +int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) +{ + switch (op) { + case Audit_equal: + return uid_eq(left, right); + case Audit_not_equal: + return !uid_eq(left, right); + case Audit_lt: + return uid_lt(left, right); + case Audit_le: + return uid_lte(left, right); + case Audit_gt: + return uid_gt(left, right); + case Audit_ge: + return uid_gte(left, right); + case Audit_bitmask: + case Audit_bittest: + default: + BUG(); + return 0; + } +} + +int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) +{ + switch (op) { + case Audit_equal: + return gid_eq(left, right); + case Audit_not_equal: + return !gid_eq(left, right); + case Audit_lt: + return gid_lt(left, right); + case Audit_le: + return gid_lte(left, right); + case Audit_gt: + return gid_gt(left, right); + case Audit_ge: + return gid_gte(left, right); + case Audit_bitmask: + case Audit_bittest: + default: + BUG(); + return 0; + } +} + /* Compare given dentry name with last component in given path, * return of 0 indicates a match. */ int audit_compare_dname_path(const char *dname, const char *path, @@ -1251,14 +1350,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, result = audit_comparator(task_pid_vnr(current), f->op, f->val); break; case AUDIT_UID: - result = audit_comparator(current_uid(), f->op, f->val); + result = audit_uid_comparator(current_uid(), f->op, f->uid); break; case AUDIT_GID: - result = audit_comparator(current_gid(), f->op, f->val); + result = audit_gid_comparator(current_gid(), f->op, f->gid); break; case AUDIT_LOGINUID: - result = audit_comparator(audit_get_loginuid(current), - f->op, f->val); + result = audit_uid_comparator(audit_get_loginuid(current), + f->op, f->uid); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4b96415527b8..0b5b8a232b55 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -113,8 +113,8 @@ struct audit_names { unsigned long ino; dev_t dev; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; dev_t rdev; u32 osid; struct audit_cap_data fcap; @@ -464,37 +464,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } -static int audit_compare_id(uid_t uid1, - struct audit_names *name, - unsigned long name_offset, - struct audit_field *f, - struct audit_context *ctx) +static int audit_compare_uid(kuid_t uid, + struct audit_names *name, + struct audit_field *f, + struct audit_context *ctx) { struct audit_names *n; - unsigned long addr; - uid_t uid2; int rc; - - BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - + if (name) { - addr = (unsigned long)name; - addr += name_offset; - - uid2 = *(uid_t *)addr; - rc = audit_comparator(uid1, f->op, uid2); + rc = audit_uid_comparator(uid, f->op, name->uid); if (rc) return rc; } - + if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - addr = (unsigned long)n; - addr += name_offset; - - uid2 = *(uid_t *)addr; + rc = audit_uid_comparator(uid, f->op, n->uid); + if (rc) + return rc; + } + } + return 0; +} - rc = audit_comparator(uid1, f->op, uid2); +static int audit_compare_gid(kgid_t gid, + struct audit_names *name, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + int rc; + + if (name) { + rc = audit_gid_comparator(gid, f->op, name->gid); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + rc = audit_gid_comparator(gid, f->op, n->gid); if (rc) return rc; } @@ -511,80 +521,62 @@ static int audit_field_compare(struct task_struct *tsk, switch (f->val) { /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: - return audit_compare_id(cred->uid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->uid, name, f, ctx); case AUDIT_COMPARE_GID_TO_OBJ_GID: - return audit_compare_id(cred->gid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->gid, name, f, ctx); case AUDIT_COMPARE_EUID_TO_OBJ_UID: - return audit_compare_id(cred->euid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->euid, name, f, ctx); case AUDIT_COMPARE_EGID_TO_OBJ_GID: - return audit_compare_id(cred->egid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_id(tsk->loginuid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(tsk->loginuid, name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: - return audit_compare_id(cred->suid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: - return audit_compare_id(cred->sgid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->sgid, name, f, ctx); case AUDIT_COMPARE_FSUID_TO_OBJ_UID: - return audit_compare_id(cred->fsuid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->fsuid, name, f, ctx); case AUDIT_COMPARE_FSGID_TO_OBJ_GID: - return audit_compare_id(cred->fsgid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: - return audit_comparator(cred->uid, f->op, tsk->loginuid); + return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); case AUDIT_COMPARE_UID_TO_EUID: - return audit_comparator(cred->uid, f->op, cred->euid); + return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: - return audit_comparator(cred->uid, f->op, cred->suid); + return audit_uid_comparator(cred->uid, f->op, cred->suid); case AUDIT_COMPARE_UID_TO_FSUID: - return audit_comparator(cred->uid, f->op, cred->fsuid); + return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: - return audit_comparator(tsk->loginuid, f->op, cred->euid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: - return audit_comparator(tsk->loginuid, f->op, cred->suid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: - return audit_comparator(cred->euid, f->op, cred->suid); + return audit_uid_comparator(cred->euid, f->op, cred->suid); case AUDIT_COMPARE_EUID_TO_FSUID: - return audit_comparator(cred->euid, f->op, cred->fsuid); + return audit_uid_comparator(cred->euid, f->op, cred->fsuid); /* suid comparisons */ case AUDIT_COMPARE_SUID_TO_FSUID: - return audit_comparator(cred->suid, f->op, cred->fsuid); + return audit_uid_comparator(cred->suid, f->op, cred->fsuid); /* gid comparisons */ case AUDIT_COMPARE_GID_TO_EGID: - return audit_comparator(cred->gid, f->op, cred->egid); + return audit_gid_comparator(cred->gid, f->op, cred->egid); case AUDIT_COMPARE_GID_TO_SGID: - return audit_comparator(cred->gid, f->op, cred->sgid); + return audit_gid_comparator(cred->gid, f->op, cred->sgid); case AUDIT_COMPARE_GID_TO_FSGID: - return audit_comparator(cred->gid, f->op, cred->fsgid); + return audit_gid_comparator(cred->gid, f->op, cred->fsgid); /* egid comparisons */ case AUDIT_COMPARE_EGID_TO_SGID: - return audit_comparator(cred->egid, f->op, cred->sgid); + return audit_gid_comparator(cred->egid, f->op, cred->sgid); case AUDIT_COMPARE_EGID_TO_FSGID: - return audit_comparator(cred->egid, f->op, cred->fsgid); + return audit_gid_comparator(cred->egid, f->op, cred->fsgid); /* sgid comparison */ case AUDIT_COMPARE_SGID_TO_FSGID: - return audit_comparator(cred->sgid, f->op, cred->fsgid); + return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; @@ -630,28 +622,28 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_UID: - result = audit_comparator(cred->uid, f->op, f->val); + result = audit_uid_comparator(cred->uid, f->op, f->uid); break; case AUDIT_EUID: - result = audit_comparator(cred->euid, f->op, f->val); + result = audit_uid_comparator(cred->euid, f->op, f->uid); break; case AUDIT_SUID: - result = audit_comparator(cred->suid, f->op, f->val); + result = audit_uid_comparator(cred->suid, f->op, f->uid); break; case AUDIT_FSUID: - result = audit_comparator(cred->fsuid, f->op, f->val); + result = audit_uid_comparator(cred->fsuid, f->op, f->uid); break; case AUDIT_GID: - result = audit_comparator(cred->gid, f->op, f->val); + result = audit_gid_comparator(cred->gid, f->op, f->gid); break; case AUDIT_EGID: - result = audit_comparator(cred->egid, f->op, f->val); + result = audit_gid_comparator(cred->egid, f->op, f->gid); break; case AUDIT_SGID: - result = audit_comparator(cred->sgid, f->op, f->val); + result = audit_gid_comparator(cred->sgid, f->op, f->gid); break; case AUDIT_FSGID: - result = audit_comparator(cred->fsgid, f->op, f->val); + result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); @@ -717,10 +709,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_OBJ_UID: if (name) { - result = audit_comparator(name->uid, f->op, f->val); + result = audit_uid_comparator(name->uid, f->op, f->uid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->uid, f->op, f->val)) { + if (audit_uid_comparator(n->uid, f->op, f->uid)) { ++result; break; } @@ -729,10 +721,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_OBJ_GID: if (name) { - result = audit_comparator(name->gid, f->op, f->val); + result = audit_gid_comparator(name->gid, f->op, f->gid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->gid, f->op, f->val)) { + if (audit_gid_comparator(n->gid, f->op, f->gid)) { ++result; break; } @@ -750,7 +742,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID: result = 0; if (ctx) - result = audit_comparator(tsk->loginuid, f->op, f->val); + result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: -- cgit v1.2.2 From e1760bd5ffae8cb98cffb030ee8e631eba28f3d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 22:39:43 -0700 Subject: userns: Convert the audit loginuid to be a kuid Always store audit loginuids in type kuid_t. Print loginuids by converting them into uids in the appropriate user namespace, and then printing the resulting uid. Modify audit_get_loginuid to return a kuid_t. Modify audit_set_loginuid to take a kuid_t. Modify /proc//loginuid on read to convert the loginuid into the user namespace of the opener of the file. Modify /proc//loginud on write to convert the loginuid rom the user namespace of the opener of the file. Cc: Al Viro Cc: Eric Paris Cc: Paul Moore ? Cc: David Miller Signed-off-by: Eric W. Biederman --- kernel/audit.c | 20 ++++++++++---------- kernel/audit_watch.c | 2 +- kernel/auditfilter.c | 7 ++++--- kernel/auditsc.c | 20 +++++++++++--------- 4 files changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2e0dd5edf69b..44a4b13c9f00 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -265,7 +265,7 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sessionid, u32 sid, + kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; @@ -273,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, loginuid, sessionid); + old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -293,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old, } static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sessionid, + int new, kuid_t loginuid, u32 sessionid, u32 sid) { int allow_changes, rc = 0, old = *to_change; @@ -320,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit, loginuid, sessionid, sid); } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit, loginuid, sessionid, sid); } -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) @@ -349,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) return rc; } -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -607,7 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - uid_t auid, u32 ses, u32 sid) + kuid_t auid, u32 ses, u32 sid) { int rc = 0; char *ctx = NULL; @@ -622,7 +622,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", task_tgid_vnr(current), from_kuid(&init_user_ns, current_uid()), - auid, ses); + from_kuid(&init_user_ns, auid), ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -644,7 +644,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - uid_t loginuid; /* loginuid of sender */ + kuid_t loginuid; /* loginuid of sender */ u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3823281401b5..1c22ec3d87bc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); audit_log_string(ab, op); audit_log_format(ab, " path="); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b30320cea26f..c4bcdbaf4d4d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1109,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, +static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, char *action, struct audit_krule *rule, int res) { @@ -1121,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); + audit_log_format(ab, "auid=%u ses=%u", + from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -1152,7 +1153,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) + size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; struct audit_netlink_list *dest; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0b5b8a232b55..26fdfc092e35 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -149,7 +149,7 @@ struct audit_aux_data_execve { struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; - uid_t target_auid[AUDIT_AUX_PIDS]; + kuid_t target_auid[AUDIT_AUX_PIDS]; uid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; u32 target_sid[AUDIT_AUX_PIDS]; @@ -214,7 +214,7 @@ struct audit_context { int arch; pid_t target_pid; - uid_t target_auid; + kuid_t target_auid; uid_t target_uid; unsigned int target_sessionid; u32 target_sid; @@ -1176,7 +1176,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk } static int audit_log_pid_context(struct audit_context *context, pid_t pid, - uid_t auid, uid_t uid, unsigned int sessionid, + kuid_t auid, uid_t uid, unsigned int sessionid, u32 sid, char *comm) { struct audit_buffer *ab; @@ -1188,7 +1188,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, if (!ab) return rc; - audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, + audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, + from_kuid(&init_user_ns, auid), uid, sessionid); if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); @@ -1630,7 +1631,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->name_count, context->ppid, context->pid, - tsk->loginuid, + from_kuid(&init_user_ns, tsk->loginuid), context->uid, context->gid, context->euid, context->suid, context->fsuid, @@ -2291,14 +2292,14 @@ static atomic_t session_id = ATOMIC_INIT(0); * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; struct audit_context *context = task->audit_context; unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) + if (uid_valid(task->loginuid)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) @@ -2315,7 +2316,8 @@ int audit_set_loginuid(uid_t loginuid) "old auid=%u new auid=%u" " old ses=%u new ses=%u", task->pid, task_uid(task), - task->loginuid, loginuid, + from_kuid(&init_user_ns, task->loginuid), + from_kuid(&init_user_ns, loginuid), task->sessionid, sessionid); audit_log_end(ab); } @@ -2543,7 +2545,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; - if (tsk->loginuid != -1) + if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else audit_sig_uid = uid; -- cgit v1.2.2 From cca080d9b622094831672a136e5ee4f702d116b1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 16:53:48 -0800 Subject: userns: Convert audit to work with user namespaces enabled - Explicitly format uids gids in audit messges in the initial user namespace. This is safe because auditd is restrected to be in the initial user namespace. - Convert audit_sig_uid into a kuid_t. - Enable building the audit code and user namespaces at the same time. The net result is that the audit subsystem now uses kuid_t and kgid_t whenever possible making it almost impossible to confuse a raw uid_t with a kuid_t preventing bugs. Cc: Al Viro Cc: Eric Paris Cc: Greg Kroah-Hartman Signed-off-by: Eric W. Biederman --- kernel/audit.c | 4 ++-- kernel/audit.h | 2 +- kernel/auditsc.c | 51 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 44a4b13c9f00..511488a7bc71 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -105,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ; static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ -uid_t audit_sig_uid = -1; +kuid_t audit_sig_uid = INVALID_UID; pid_t audit_sig_pid = -1; u32 audit_sig_sid = 0; @@ -853,7 +853,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) security_release_secctx(ctx, len); return -ENOMEM; } - sig_data->uid = audit_sig_uid; + sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (audit_sig_sid) { memcpy(sig_data->ctx, ctx, len); diff --git a/kernel/audit.h b/kernel/audit.h index 4b428bb41ea3..9eb3d79482b6 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -146,7 +146,7 @@ extern void audit_kill_trees(struct list_head *); extern char *audit_unpack_string(void **, size_t *, size_t); extern pid_t audit_sig_pid; -extern uid_t audit_sig_uid; +extern kuid_t audit_sig_uid; extern u32 audit_sig_sid; #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 26fdfc092e35..ff4798fcb488 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -150,7 +150,7 @@ struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; kuid_t target_auid[AUDIT_AUX_PIDS]; - uid_t target_uid[AUDIT_AUX_PIDS]; + kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; u32 target_sid[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; @@ -208,14 +208,14 @@ struct audit_context { size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; - uid_t uid, euid, suid, fsuid; - gid_t gid, egid, sgid, fsgid; + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; unsigned long personality; int arch; pid_t target_pid; kuid_t target_auid; - uid_t target_uid; + kuid_t target_uid; unsigned int target_sessionid; u32 target_sid; char target_comm[TASK_COMM_LEN]; @@ -231,8 +231,8 @@ struct audit_context { long args[6]; } socketcall; struct { - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; u32 osid; int has_perm; @@ -1176,7 +1176,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk } static int audit_log_pid_context(struct audit_context *context, pid_t pid, - kuid_t auid, uid_t uid, unsigned int sessionid, + kuid_t auid, kuid_t uid, unsigned int sessionid, u32 sid, char *comm) { struct audit_buffer *ab; @@ -1190,7 +1190,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), - uid, sessionid); + from_kuid(&init_user_ns, uid), sessionid); if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; @@ -1440,7 +1440,9 @@ static void show_special(struct audit_context *context, int *call_panic) u32 osid = context->ipc.osid; audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", - context->ipc.uid, context->ipc.gid, context->ipc.mode); + from_kuid(&init_user_ns, context->ipc.uid), + from_kgid(&init_user_ns, context->ipc.gid), + context->ipc.mode); if (osid) { char *ctx = NULL; u32 len; @@ -1553,8 +1555,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, MAJOR(n->dev), MINOR(n->dev), n->mode, - n->uid, - n->gid, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); } @@ -1632,10 +1634,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->ppid, context->pid, from_kuid(&init_user_ns, tsk->loginuid), - context->uid, - context->gid, - context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid, tty, + from_kuid(&init_user_ns, context->uid), + from_kgid(&init_user_ns, context->gid), + from_kuid(&init_user_ns, context->euid), + from_kuid(&init_user_ns, context->suid), + from_kuid(&init_user_ns, context->fsuid), + from_kgid(&init_user_ns, context->egid), + from_kgid(&init_user_ns, context->sgid), + from_kgid(&init_user_ns, context->fsgid), + tty, tsk->sessionid); @@ -2315,7 +2322,8 @@ int audit_set_loginuid(kuid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task_uid(task), + task->pid, + from_kuid(&init_user_ns, task_uid(task)), from_kuid(&init_user_ns, task->loginuid), from_kuid(&init_user_ns, loginuid), task->sessionid, sessionid); @@ -2540,7 +2548,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; - uid_t uid = current_uid(), t_uid = task_uid(t); + kuid_t uid = current_uid(), t_uid = task_uid(t); if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { @@ -2666,8 +2674,8 @@ void __audit_mmap_fd(int fd, int flags) static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) { - uid_t auid, uid; - gid_t gid; + kuid_t auid, uid; + kgid_t gid; unsigned int sessionid; auid = audit_get_loginuid(current); @@ -2675,7 +2683,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); + from_kuid(&init_user_ns, auid), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); -- cgit v1.2.2 From 4bd6e32acec66c55c6c1af4672f3216b2ac88e35 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 17:56:49 -0800 Subject: userns: Convert taskstats to handle the user and pid namespaces. - Explicitly limit exit task stat broadcast to the initial user and pid namespaces, as it is already limited to the initial network namespace. - For broadcast task stats explicitly generate all of the idenitiers in terms of the initial user namespace and the initial pid namespace. - For request stats report them in terms of the current user namespace and the current pid namespace. Netlink messages are delivered syncrhonously to the kernel allowing us to get the user namespace and the pid namespace from the current task. - Pass the namespaces for representing pids and uids and gids into bacct_add_task. Cc: Balbir Singh Signed-off-by: Eric W. Biederman --- kernel/taskstats.c | 23 +++++++++++++++++------ kernel/tsacct.c | 12 +++++++----- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..3880df2acf05 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } -static void fill_stats(struct task_struct *tsk, struct taskstats *stats) +static void fill_stats(struct user_namespace *user_ns, + struct pid_namespace *pid_ns, + struct task_struct *tsk, struct taskstats *stats) { memset(stats, 0, sizeof(*stats)); /* @@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats) stats->version = TASKSTATS_VERSION; stats->nvcsw = tsk->nvcsw; stats->nivcsw = tsk->nivcsw; - bacct_add_tsk(stats, tsk); + bacct_add_tsk(user_ns, pid_ns, stats, tsk); /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); @@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) rcu_read_unlock(); if (!tsk) return -ESRCH; - fill_stats(tsk, stats); + fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); put_task_struct(tsk); return 0; } @@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; + if (current_user_ns() != &init_user_ns) + return -EINVAL; + + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + if (isadd == REGISTER) { for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), @@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (rc < 0) return; - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, + task_pid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; - fill_stats(tsk, stats); + fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!is_thread_group || !group_dead) goto send; - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, + task_tgid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 23b4d784ebdd..625df0b44690 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -26,7 +26,9 @@ /* * fill in basic accounting fields */ -void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +void bacct_add_tsk(struct user_namespace *user_ns, + struct pid_namespace *pid_ns, + struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; struct timespec uptime, ts; @@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_pid = tsk->pid; + stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); - stats->ac_uid = tcred->uid; - stats->ac_gid = tcred->gid; + stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); + stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); stats->ac_ppid = pid_alive(tsk) ? - rcu_dereference(tsk->real_parent)->tgid : 0; + task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); stats->ac_utime = cputime_to_usecs(tsk->utime); stats->ac_stime = cputime_to_usecs(tsk->stime); -- cgit v1.2.2 From f8f3d4de2d04e1a5b4293b67faee8ebabc64e9fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 16:54:50 -0800 Subject: userns: Convert bsd process accounting to use kuid and kgid where appropriate BSD process accounting conveniently passes the file the accounting records will be written into to do_acct_process. The file credentials captured the user namespace of the opener of the file. Use the file credentials to format the uid and the gid of the current process into the user namespace of the user that started the bsd process accounting. Cc: Pavel Emelyanov Reviewed-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 02e6167a53b0..6cd7529c9e6a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = orig_cred->uid; - ac.ac_gid = orig_cred->gid; + ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); + ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif -- cgit v1.2.2 From d20b92ab668cc44fc84bba0001839c5a8013a5cd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 13 Mar 2012 16:02:19 -0700 Subject: userns: Teach trace to use from_kuid - When tracing capture the kuid. - When displaying the data to user space convert the kuid into the user namespace of the process that opened the report file. Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Eric W. Biederman --- kernel/trace/trace.c | 3 ++- kernel/trace/trace.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496ce..c9ace838d509 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2060,7 +2060,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", - data->comm, data->pid, data->uid, data->nice, + data->comm, data->pid, + from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..40a6f30c985f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -147,7 +147,7 @@ struct trace_array_cpu { unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; - uid_t uid; + kuid_t uid; char comm[TASK_COMM_LEN]; }; -- cgit v1.2.2 From f76d207a66c3a53defea67e7d36c3eb1b7d6d61d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 30 Aug 2012 01:24:05 -0700 Subject: userns: Add kprojid_t and associated infrastructure in projid.h Implement kprojid_t a cousin of the kuid_t and kgid_t. The per user namespace mapping of project id values can be set with /proc//projid_map. A full compliment of helpers is provided: make_kprojid, from_kprojid, from_kprojid_munged, kporjid_has_mapping, projid_valid, projid_eq, projid_eq, projid_lt. Project identifiers are part of the generic disk quota interface, although it appears only xfs implements project identifiers currently. The xfs code allows anyone who has permission to set the project identifier on a file to use any project identifier so when setting up the user namespace project identifier mappings I do not require a capability. Cc: Dave Chinner Cc: Jan Kara Signed-off-by: "Eric W. Biederman" --- kernel/user.c | 8 +++ kernel/user_namespace.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index b815fefbe76f..750acffbe9ec 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -38,6 +38,14 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, + .projid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, .kref = { .refcount = ATOMIC_INIT(3), }, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86602316422d..456a6b9fba34 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -19,6 +19,7 @@ #include #include #include +#include static struct kmem_cache *user_ns_cachep __read_mostly; @@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) } EXPORT_SYMBOL(from_kgid_munged); +/** + * make_kprojid - Map a user-namespace projid pair into a kprojid. + * @ns: User namespace that the projid is in + * @projid: Project identifier + * + * Maps a user-namespace uid pair into a kernel internal kuid, + * and returns that kuid. + * + * When there is no mapping defined for the user-namespace projid + * pair INVALID_PROJID is returned. Callers are expected to test + * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * may be tested for using projid_valid(). + */ +kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) +{ + /* Map the uid to a global kernel uid */ + return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); +} +EXPORT_SYMBOL(make_kprojid); + +/** + * from_kprojid - Create a projid from a kprojid user-namespace pair. + * @targ: The user namespace we want a projid in. + * @kprojid: The kernel internal project identifier to start with. + * + * Map @kprojid into the user-namespace specified by @targ and + * return the resulting projid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kprojid has no mapping in @targ (projid_t)-1 is returned. + */ +projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) +{ + /* Map the uid from a global kernel uid */ + return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); +} +EXPORT_SYMBOL(from_kprojid); + +/** + * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. + * @targ: The user namespace we want a projid in. + * @kprojid: The kernel internal projid to start with. + * + * Map @kprojid into the user-namespace specified by @targ and + * return the resulting projid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kprojid from_kprojid_munged never fails and always + * returns a valid projid. This makes from_kprojid_munged + * appropriate for use in syscalls like stat and where + * failing the system call and failing to provide a valid projid are + * not an options. + * + * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. + */ +projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) +{ + projid_t projid; + projid = from_kprojid(targ, kprojid); + + if (projid == (projid_t) -1) + projid = OVERFLOW_PROJID; + return projid; +} +EXPORT_SYMBOL(from_kprojid_munged); + + static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; @@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v) return 0; } +static int projid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + projid_t lower; + + lower_ns = seq_user_ns(seq); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; @@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos) return m_start(seq, ppos, &ns->gid_map); } +static void *projid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->projid_map); +} + static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; @@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = { .show = gid_m_show, }; +struct seq_operations proc_projid_seq_operations = { + .start = projid_m_start, + .stop = m_stop, + .next = m_next, + .show = projid_m_show, +}; + static DEFINE_MUTEX(id_map_mutex); static ssize_t map_write(struct file *file, const char __user *buf, @@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Require the appropriate privilege CAP_SETUID or CAP_SETGID * over the user namespace in order to set the id mapping. */ - if (!ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) goto out; /* Get a buffer */ @@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz &ns->gid_map, &ns->parent->gid_map); } +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); + + if (!ns->parent) + return -EPERM; + + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + + /* Anyone can set any valid project id no capability needed */ + return map_write(file, buf, size, ppos, -1, + &ns->projid_map, &ns->parent->projid_map); +} + static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow anyone to set a mapping that doesn't require privilege */ + if (!cap_valid(cap_setid)) + return true; + /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. */ -- cgit v1.2.2 From ea1abd6197d5805655da1bb589929762f4b4aa08 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 09:59:22 -0700 Subject: workqueue: reimplement idle worker rebinding Currently rebind_workers() uses rebinds idle workers synchronously before proceeding to requesting busy workers to rebind. This is necessary because all workers on @worker_pool->idle_list must be bound before concurrency management local wake-ups from the busy workers take place. Unfortunately, the synchronous idle rebinding is quite complicated. This patch reimplements idle rebinding to simplify the code path. Rather than trying to make all idle workers bound before rebinding busy workers, we simply remove all to-be-bound idle workers from the idle list and let them add themselves back after completing rebinding (successful or not). As only workers which finished rebinding can on on the idle worker list, the idle worker list is guaranteed to have only bound workers unless CPU went down again and local wake-ups are safe. After the change, @worker_pool->nr_idle may deviate than the actual number of idle workers on @worker_pool->idle_list. More specifically, nr_idle may be non-zero while ->idle_list is empty. All users of ->nr_idle and ->idle_list are audited. The only affected one is too_many_workers() which is updated to check %false if ->idle_list is empty regardless of ->nr_idle. After this patch, rebind_workers() no longer performs the nasty idle-rebind retries which require temporary release of gcwq->lock, and both unbinding and rebinding are atomic w.r.t. global_cwq->lock. worker->idle_rebind and global_cwq->rebind_hold are now unnecessary and removed along with the definition of struct idle_rebind. Changed from V1: 1) remove unlikely from too_many_workers(), ->idle_list can be empty anytime, even before this patch, no reason to use unlikely. 2) fix a small rebasing mistake. (which is from rebasing the orignal fixing patch to for-next) 3) add a lot of comments. 4) clear WORKER_REBIND unconditionaly in idle_worker_rebind() tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 141 ++++++++++++++++------------------------------------- 1 file changed, 42 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 31d8a4586d4c..770c1a8128bf 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,7 +126,6 @@ enum { struct global_cwq; struct worker_pool; -struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -150,7 +149,6 @@ struct worker { int id; /* I: worker id */ /* for rebinding worker to CPU */ - struct idle_rebind *idle_rebind; /* L: for idle worker */ struct work_struct rebind_work; /* L: for busy worker */ }; @@ -160,6 +158,8 @@ struct worker_pool { struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ + + /* nr_idle includes the ones off idle_list for rebinding */ int nr_idle; /* L: currently idle ones */ struct list_head idle_list; /* X: list of idle workers */ @@ -186,8 +186,6 @@ struct global_cwq { struct worker_pool pools[NR_WORKER_POOLS]; /* normal and highpri pools */ - - wait_queue_head_t rebind_hold; /* rebind hold wait */ } ____cacheline_aligned_in_smp; /* @@ -687,6 +685,13 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; + /* + * nr_idle and idle_list may disagree if idle rebinding is in + * progress. Never return %true if idle_list is empty. + */ + if (list_empty(&pool->idle_list)) + return false; + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -1611,37 +1616,26 @@ __acquires(&gcwq->lock) } } -struct idle_rebind { - int cnt; /* # workers to be rebound */ - struct completion done; /* all workers rebound */ -}; - /* - * Rebind an idle @worker to its CPU. During CPU onlining, this has to - * happen synchronously for idle workers. worker_thread() will test + * Rebind an idle @worker to its CPU. worker_thread() will test * %WORKER_REBIND before leaving idle and call this function. */ static void idle_worker_rebind(struct worker *worker) { struct global_cwq *gcwq = worker->pool->gcwq; - /* CPU must be online at this point */ - WARN_ON(!worker_maybe_bind_and_lock(worker)); - if (!--worker->idle_rebind->cnt) - complete(&worker->idle_rebind->done); - spin_unlock_irq(&worker->pool->gcwq->lock); - - /* we did our part, wait for rebind_workers() to finish up */ - wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); - /* - * rebind_workers() shouldn't finish until all workers passed the - * above WORKER_REBIND wait. Tell it when done. + * CPU may go down again inbetween. If rebinding fails, reinstate + * UNBOUND. We're off idle_list and nobody else can do it for us. */ - spin_lock_irq(&worker->pool->gcwq->lock); - if (!--worker->idle_rebind->cnt) - complete(&worker->idle_rebind->done); - spin_unlock_irq(&worker->pool->gcwq->lock); + if (!worker_maybe_bind_and_lock(worker)) + worker->flags |= WORKER_UNBOUND; + + worker_clr_flags(worker, WORKER_REBIND); + + /* rebind complete, become available again */ + list_add(&worker->entry, &worker->pool->idle_list); + spin_unlock_irq(&gcwq->lock); } /* @@ -1676,29 +1670,25 @@ static void busy_worker_rebind_fn(struct work_struct *work) * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding * is different for idle and busy ones. * - * The idle ones should be rebound synchronously and idle rebinding should - * be complete before any worker starts executing work items with - * concurrency management enabled; otherwise, scheduler may oops trying to - * wake up non-local idle worker from wq_worker_sleeping(). - * - * This is achieved by repeatedly requesting rebinding until all idle - * workers are known to have been rebound under @gcwq->lock and holding all - * idle workers from becoming busy until idle rebinding is complete. + * Idle ones will be removed from the idle_list and woken up. They will + * add themselves back after completing rebind. This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management. * - * Once idle workers are rebound, busy workers can be rebound as they - * finish executing their current work items. Queueing the rebind work at - * the head of their scheduled lists is enough. Note that nr_running will - * be properbly bumped as busy workers rebind. + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough. Note that nr_running will be properly bumped as busy workers + * rebind. * - * On return, all workers are guaranteed to either be bound or have rebind - * work item scheduled. + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case. Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe. */ static void rebind_workers(struct global_cwq *gcwq) - __releases(&gcwq->lock) __acquires(&gcwq->lock) { - struct idle_rebind idle_rebind; struct worker_pool *pool; - struct worker *worker; + struct worker *worker, *n; struct hlist_node *pos; int i; @@ -1707,46 +1697,29 @@ static void rebind_workers(struct global_cwq *gcwq) for_each_worker_pool(pool, gcwq) lockdep_assert_held(&pool->manager_mutex); - /* - * Rebind idle workers. Interlocked both ways. We wait for - * workers to rebind via @idle_rebind.done. Workers will wait for - * us to finish up by watching %WORKER_REBIND. - */ - init_completion(&idle_rebind.done); -retry: - idle_rebind.cnt = 1; - INIT_COMPLETION(idle_rebind.done); - - /* set REBIND and kick idle ones, we'll wait for these later */ + /* set REBIND and kick idle ones */ for_each_worker_pool(pool, gcwq) { - list_for_each_entry(worker, &pool->idle_list, entry) { + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { unsigned long worker_flags = worker->flags; - if (worker->flags & WORKER_REBIND) - continue; - /* morph UNBOUND to REBIND atomically */ worker_flags &= ~WORKER_UNBOUND; worker_flags |= WORKER_REBIND; ACCESS_ONCE(worker->flags) = worker_flags; - idle_rebind.cnt++; - worker->idle_rebind = &idle_rebind; + /* + * idle workers should be off @pool->idle_list + * until rebind is complete to avoid receiving + * premature local wake-ups. + */ + list_del_init(&worker->entry); /* worker_thread() will call idle_worker_rebind() */ wake_up_process(worker->task); } } - if (--idle_rebind.cnt) { - spin_unlock_irq(&gcwq->lock); - wait_for_completion(&idle_rebind.done); - spin_lock_irq(&gcwq->lock); - /* busy ones might have become idle while waiting, retry */ - goto retry; - } - - /* all idle workers are rebound, rebind busy workers */ + /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { unsigned long worker_flags = worker->flags; struct work_struct *rebind_work = &worker->rebind_work; @@ -1776,34 +1749,6 @@ retry: worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } - - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - * - * We need to make sure all idle workers passed WORKER_REBIND wait - * in idle_worker_rebind() before returning; otherwise, workers can - * get stuck at the wait if hotplug cycle repeats. - */ - idle_rebind.cnt = 1; - INIT_COMPLETION(idle_rebind.done); - - for_each_worker_pool(pool, gcwq) { - list_for_each_entry(worker, &pool->idle_list, entry) { - worker->flags &= ~WORKER_REBIND; - idle_rebind.cnt++; - } - } - - wake_up_all(&gcwq->rebind_hold); - - if (--idle_rebind.cnt) { - spin_unlock_irq(&gcwq->lock); - wait_for_completion(&idle_rebind.done); - spin_lock_irq(&gcwq->lock); - } } static struct worker *alloc_worker(void) @@ -3916,8 +3861,6 @@ static int __init init_workqueues(void) mutex_init(&pool->manager_mutex); ida_init(&pool->worker_ida); } - - init_waitqueue_head(&gcwq->rebind_hold); } /* create the initial worker */ -- cgit v1.2.2 From eab6d82843ee1df244f8847d1bf8bb89160ec4aa Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 09:59:22 -0700 Subject: workqueue: WORKER_REBIND is no longer necessary for busy rebinding Because the old unbind/rebinding implementation wasn't atomic w.r.t. GCWQ_DISASSOCIATED manipulation which is protected by global_cwq->lock, we had to use two flags, WORKER_UNBOUND and WORKER_REBIND, to avoid incorrectly losing all NOT_RUNNING bits with back-to-back CPU hotplug operations; otherwise, completion of rebinding while another unbinding is in progress could clear UNBIND prematurely. Now that both unbind/rebinding are atomic w.r.t. GCWQ_DISASSOCIATED, there's no need to use two flags. Just one is enough. Don't use WORKER_REBIND for busy rebinding. tj: Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 770c1a8128bf..794724efb733 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1649,16 +1649,8 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - worker_maybe_bind_and_lock(worker); - - /* - * %WORKER_REBIND must be cleared even if the above binding failed; - * otherwise, we may confuse the next CPU_UP cycle or oops / get - * stuck by calling idle_worker_rebind() prematurely. If CPU went - * down again inbetween, %WORKER_UNBOUND would be set, so clearing - * %WORKER_REBIND is always safe. - */ - worker_clr_flags(worker, WORKER_REBIND); + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); spin_unlock_irq(&gcwq->lock); } @@ -1721,15 +1713,9 @@ static void rebind_workers(struct global_cwq *gcwq) /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { - unsigned long worker_flags = worker->flags; struct work_struct *rebind_work = &worker->rebind_work; struct workqueue_struct *wq; - /* morph UNBOUND to REBIND atomically */ - worker_flags &= ~WORKER_UNBOUND; - worker_flags |= WORKER_REBIND; - ACCESS_ONCE(worker->flags) = worker_flags; - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) continue; -- cgit v1.2.2 From 5f7dabfd5cb115937afb4649e4c73b02f927f6ae Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 09:59:23 -0700 Subject: workqueue: WORKER_REBIND is no longer necessary for idle rebinding Now both worker destruction and idle rebinding remove the worker from idle list while it's still idle, so list_empty(&worker->entry) can be used to test whether either is pending and WORKER_DIE to distinguish between the two instead making WORKER_REBIND unnecessary. Use list_empty(&worker->entry) to determine whether destruction or rebinding is pending. This simplifies worker state transitions. WORKER_REBIND is not needed anymore. Remove it. tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 794724efb733..cdc6bfc84b78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,11 +73,10 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ @@ -1618,20 +1617,15 @@ __acquires(&gcwq->lock) /* * Rebind an idle @worker to its CPU. worker_thread() will test - * %WORKER_REBIND before leaving idle and call this function. + * list_empty(@worker->entry) before leaving idle and call this function. */ static void idle_worker_rebind(struct worker *worker) { struct global_cwq *gcwq = worker->pool->gcwq; - /* - * CPU may go down again inbetween. If rebinding fails, reinstate - * UNBOUND. We're off idle_list and nobody else can do it for us. - */ - if (!worker_maybe_bind_and_lock(worker)) - worker->flags |= WORKER_UNBOUND; - - worker_clr_flags(worker, WORKER_REBIND); + /* CPU may go down again inbetween, clear UNBOUND only on success */ + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); /* rebind complete, become available again */ list_add(&worker->entry, &worker->pool->idle_list); @@ -1689,16 +1683,9 @@ static void rebind_workers(struct global_cwq *gcwq) for_each_worker_pool(pool, gcwq) lockdep_assert_held(&pool->manager_mutex); - /* set REBIND and kick idle ones */ + /* dequeue and kick idle ones */ for_each_worker_pool(pool, gcwq) { list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - unsigned long worker_flags = worker->flags; - - /* morph UNBOUND to REBIND atomically */ - worker_flags &= ~WORKER_UNBOUND; - worker_flags |= WORKER_REBIND; - ACCESS_ONCE(worker->flags) = worker_flags; - /* * idle workers should be off @pool->idle_list * until rebind is complete to avoid receiving @@ -1706,7 +1693,10 @@ static void rebind_workers(struct global_cwq *gcwq) */ list_del_init(&worker->entry); - /* worker_thread() will call idle_worker_rebind() */ + /* + * worker_thread() will see the above dequeuing + * and call idle_worker_rebind(). + */ wake_up_process(worker->task); } } @@ -2176,7 +2166,7 @@ __acquires(&gcwq->lock) * necessary to avoid spurious warnings from rescuers servicing the * unbound or a disassociated gcwq. */ - WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && !(gcwq->flags & GCWQ_DISASSOCIATED) && raw_smp_processor_id() != gcwq->cpu); @@ -2300,18 +2290,17 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&gcwq->lock); - /* - * DIE can be set only while idle and REBIND set while busy has - * @worker->rebind_work scheduled. Checking here is enough. - */ - if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { + /* we are off idle list if destruction or rebind is requested */ + if (unlikely(list_empty(&worker->entry))) { spin_unlock_irq(&gcwq->lock); + /* if DIE is set, destruction is requested */ if (worker->flags & WORKER_DIE) { worker->task->flags &= ~PF_WQ_WORKER; return 0; } + /* otherwise, rebind */ idle_worker_rebind(worker); goto woke_up; } -- cgit v1.2.2 From b2eb83d123c1cc9f96a8e452b26a6ebe631b3ad7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 09:59:23 -0700 Subject: workqueue: rename manager_mutex to assoc_mutex Now that manager_mutex's role has changed from synchronizing manager role to excluding hotplug against manager, the name is misleading. As it is protecting the CPU-association of the gcwq now, rename it to assoc_mutex. This patch is pure rename and doesn't introduce any functional change. tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cdc6bfc84b78..e651239f1ece 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,7 +58,7 @@ enum { * be executing on any CPU. The gcwq behaves as an unbound one. * * Note that DISASSOCIATED can be flipped only while holding - * managership of all pools on the gcwq to avoid changing binding + * assoc_mutex of all pools on the gcwq to avoid changing binding * state while create_worker() is in progress. */ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ @@ -165,7 +165,7 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - struct mutex manager_mutex; /* mutex manager should hold */ + struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ }; @@ -1681,7 +1681,7 @@ static void rebind_workers(struct global_cwq *gcwq) lockdep_assert_held(&gcwq->lock); for_each_worker_pool(pool, gcwq) - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->assoc_mutex); /* dequeue and kick idle ones */ for_each_worker_pool(pool, gcwq) { @@ -2081,22 +2081,22 @@ static bool manage_workers(struct worker *worker) * grab %POOL_MANAGING_WORKERS to achieve this because that can * lead to idle worker depletion (all become busy thinking someone * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->manager_mutex to synchronize + * extreme circumstances. Use @pool->assoc_mutex to synchronize * manager against CPU hotplug. * - * manager_mutex would always be free unless CPU hotplug is in + * assoc_mutex would always be free unless CPU hotplug is in * progress. trylock first without dropping @gcwq->lock. */ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { spin_unlock_irq(&pool->gcwq->lock); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->assoc_mutex); /* * CPU hotplug could have happened while we were waiting - * for manager_mutex. Hotplug itself can't handle us + * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and * @gcwq's state and ours could have deviated. * - * As hotplug is now excluded via manager_mutex, we can + * As hotplug is now excluded via assoc_mutex, we can * simply try to bind. It will succeed or fail depending * on @gcwq's current state. Try it and adjust * %WORKER_UNBOUND accordingly. @@ -2119,7 +2119,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_create_worker(pool); pool->flags &= ~POOL_MANAGING_WORKERS; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->assoc_mutex); return ret; } @@ -3474,23 +3474,23 @@ EXPORT_SYMBOL_GPL(work_busy); */ /* claim manager positions of all pools */ -static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) +static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) { struct worker_pool *pool; for_each_worker_pool(pool, gcwq) - mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); + mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); spin_lock_irq(&gcwq->lock); } /* release manager positions */ -static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; spin_unlock_irq(&gcwq->lock); for_each_worker_pool(pool, gcwq) - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->assoc_mutex); } static void gcwq_unbind_fn(struct work_struct *work) @@ -3503,7 +3503,7 @@ static void gcwq_unbind_fn(struct work_struct *work) BUG_ON(gcwq->cpu != smp_processor_id()); - gcwq_claim_management_and_lock(gcwq); + gcwq_claim_assoc_and_lock(gcwq); /* * We've claimed all manager positions. Make all workers unbound @@ -3520,7 +3520,7 @@ static void gcwq_unbind_fn(struct work_struct *work) gcwq->flags |= GCWQ_DISASSOCIATED; - gcwq_release_management_and_unlock(gcwq); + gcwq_release_assoc_and_unlock(gcwq); /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3576,10 +3576,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq_claim_management_and_lock(gcwq); + gcwq_claim_assoc_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; rebind_workers(gcwq); - gcwq_release_management_and_unlock(gcwq); + gcwq_release_assoc_and_unlock(gcwq); break; } return NOTIFY_OK; @@ -3833,7 +3833,7 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_mutex); + mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); } } -- cgit v1.2.2 From 9fdf9b73d61c87a9c16f101bb8bbe069d13046f5 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 09:59:23 -0700 Subject: workqueue: use __cpuinit instead of __devinit for cpu callbacks For workqueue hotplug callbacks, it makes less sense to use __devinit which discards the memory after boot if !HOTPLUG. __cpuinit, which discards the memory after boot if !HOTPLUG_CPU fits better. tj: Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e651239f1ece..942bb750a650 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3548,7 +3548,7 @@ static void gcwq_unbind_fn(struct work_struct *work) * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -3589,7 +3589,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, * Workqueues should be brought down after normal priority CPU notifiers. * This will be registered as low priority CPU notifier. */ -static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { -- cgit v1.2.2 From a5b4e57d7cc07cb28ccf16de0876a4770ae84920 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 09:59:23 -0700 Subject: workqueue: use hotcpu_notifier() for workqueue_cpu_down_callback() workqueue_cpu_down_callback() is used only if HOTPLUG_CPU=y, so hotcpu_notifier() fits better than cpu_notifier(). When HOTPLUG_CPU=y, hotcpu_notifier() and cpu_notifier() are the same. When HOTPLUG_CPU=n, if we use cpu_notifier(), workqueue_cpu_down_callback() will be called during boot to do nothing, and the memory of workqueue_cpu_down_callback() and gcwq_unbind_fn() will be discarded after boot. If we use hotcpu_notifier(), we can avoid the no-op call of workqueue_cpu_down_callback() and the memory of workqueue_cpu_down_callback() and gcwq_unbind_fn() will be discard at build time: $ ls -l kernel/workqueue.o.cpu_notifier kernel/workqueue.o.hotcpu_notifier -rw-rw-r-- 1 laijs laijs 484080 Sep 15 11:31 kernel/workqueue.o.cpu_notifier -rw-rw-r-- 1 laijs laijs 478240 Sep 15 11:31 kernel/workqueue.o.hotcpu_notifier $ size kernel/workqueue.o.cpu_notifier kernel/workqueue.o.hotcpu_notifier text data bss dec hex filename 18513 2387 1221 22121 5669 kernel/workqueue.o.cpu_notifier 18082 2355 1221 21658 549a kernel/workqueue.o.hotcpu_notifier tj: Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 942bb750a650..48becaba1c94 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3807,7 +3807,7 @@ static int __init init_workqueues(void) WORK_CPU_LAST); cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); - cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { -- cgit v1.2.2 From 3aa62497594430ea522050b75c033f71f2c60ee6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 10:40:00 -0700 Subject: workqueue: fix possible stall on try_to_grab_pending() of a delayed work item Currently, when try_to_grab_pending() grabs a delayed work item, it leaves its linked work items alone on the delayed_works. The linked work items are always NO_COLOR and will cause future cwq_activate_first_delayed() increase cwq->nr_active incorrectly, and may cause the whole cwq to stall. For example, state: cwq->max_active = 1, cwq->nr_active = 1 one work in cwq->pool, many in cwq->delayed_works. step1: try_to_grab_pending() removes a work item from delayed_works but leaves its NO_COLOR linked work items on it. step2: Later on, cwq_activate_first_delayed() activates the linked work item increasing ->nr_active. step3: cwq->nr_active = 1, but all activated work items of the cwq are NO_COLOR. When they finish, cwq->nr_active will not be decreased due to NO_COLOR, and no further work items will be activated from cwq->delayed_works. the cwq stalls. Fix it by ensuring the target work item is activated before stealing PENDING in try_to_grab_pending(). This ensures that all the linked work items are activated without incorrectly bumping cwq->nr_active. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@kernel.org --- kernel/workqueue.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 48becaba1c94..d2fe8e77ceb7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -977,10 +977,9 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +static void cwq_activate_delayed_work(struct work_struct *work) { - struct work_struct *work = list_first_entry(&cwq->delayed_works, - struct work_struct, entry); + struct cpu_workqueue_struct *cwq = get_work_cwq(work); trace_workqueue_activate_work(work); move_linked_works(work, &cwq->pool->worklist, NULL); @@ -988,6 +987,14 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) cwq->nr_active++; } +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + + cwq_activate_delayed_work(work); +} + /** * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight * @cwq: cwq of interest @@ -1106,6 +1113,18 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, smp_rmb(); if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); + + /* + * A delayed work item cannot be grabbed directly + * because it might have linked NO_COLOR work items + * which, if left on the delayed_list, will confuse + * cwq->nr_active management later on and cause + * stall. Make sure the work item is activated + * before grabbing. + */ + if (*work_data_bits(work) & WORK_STRUCT_DELAYED) + cwq_activate_delayed_work(work); + list_del_init(&work->entry); cwq_dec_nr_in_flight(get_work_cwq(work), get_work_color(work), -- cgit v1.2.2 From b3f9f405a21a29c06c31fb2d6ab36ef9ba7c027b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Sep 2012 10:40:00 -0700 Subject: workqueue: remove @delayed from cwq_dec_nr_in_flight() @delayed is now always false for all callers, remove it. tj: Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d2fe8e77ceb7..3e324aae3c98 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -999,7 +999,6 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight * @cwq: cwq of interest * @color: color of work which left the queue - * @delayed: for a delayed work * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its cwq and handle workqueue flushing. @@ -1007,8 +1006,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, - bool delayed) +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) { /* ignore uncolored works */ if (color == WORK_NO_COLOR) @@ -1016,13 +1014,11 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, cwq->nr_in_flight[color]--; - if (!delayed) { - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); } /* is flush in progress and are we at the flushing tip? */ @@ -1127,8 +1123,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, list_del_init(&work->entry); cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work), - *work_data_bits(work) & WORK_STRUCT_DELAYED); + get_work_color(work)); spin_unlock(&gcwq->lock); return 1; @@ -2264,7 +2259,7 @@ __acquires(&gcwq->lock) hlist_del_init(&worker->hentry); worker->current_work = NULL; worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color, false); + cwq_dec_nr_in_flight(cwq, work_color); } /** -- cgit v1.2.2 From ed48ece27cd3d5ee0354c32bbaec0f3e1d4715c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Sep 2012 12:48:43 -0700 Subject: workqueue: reimplement work_on_cpu() using system_wq The existing work_on_cpu() implementation is hugely inefficient. It creates a new kthread, execute that single function and then let the kthread die on each invocation. Now that system_wq can handle concurrent executions, there's no advantage of doing this. Reimplement work_on_cpu() using system_wq which makes it simpler and way more efficient. stable: While this isn't a fix in itself, it's needed to fix a workqueue related bug in cpufreq/powernow-k8. AFAICS, this shouldn't break other existing users. Signed-off-by: Tejun Heo Acked-by: Jiri Kosina Cc: Linus Torvalds Cc: Bjorn Helgaas Cc: Len Brown Cc: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/workqueue.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b80065a2450a..3c5a79e2134c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3576,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, #ifdef CONFIG_SMP struct work_for_cpu { - struct completion completion; + struct work_struct work; long (*fn)(void *); void *arg; long ret; }; -static int do_work_for_cpu(void *_wfc) +static void work_for_cpu_fn(struct work_struct *work) { - struct work_for_cpu *wfc = _wfc; + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; } /** @@ -3602,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc) */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.2.2 From 9f4bd4cddbb50d7617353102e10ce511c5ef6df2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 19 Sep 2012 10:40:48 -0700 Subject: workqueue: introduce cwq_set_max_active() helper for thaw_workqueues() Using a helper instead of open code makes thaw_workqueues() clearer. The helper will also be used by the next patch. tj: Slight update to comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3e324aae3c98..b5d722548ffd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3366,6 +3366,26 @@ void destroy_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * cwq_set_max_active - adjust max_active of a cwq + * @cwq: target cpu_workqueue_struct + * @max_active: new max_active value. + * + * Set @cwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +{ + cwq->max_active = max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); +} + /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue @@ -3792,11 +3812,7 @@ void thaw_workqueues(void) continue; /* restore max_active and repopulate worklist */ - cwq->max_active = wq->saved_max_active; - - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + cwq_set_max_active(cwq, wq->saved_max_active); } for_each_worker_pool(pool, gcwq) -- cgit v1.2.2 From 70369b117a8fc5ac18a635ced23ee49f8e722e7b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 19 Sep 2012 10:40:48 -0700 Subject: workqueue: use cwq_set_max_active() helper for workqueue_set_max_active() workqueue_set_max_active() may increase ->max_active without activating delayed works and may make the activation order differ from the queueing order. Both aren't strictly bugs but the resulting behavior could be a bit odd. To make things more consistent, use cwq_set_max_active() helper which immediately makes use of the newly increased max_mactive if there are delayed work items and also keeps the activation order. tj: Slight update to description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b5d722548ffd..4f5c61f8b0e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3413,7 +3413,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) - get_cwq(gcwq->cpu, wq)->max_active = max_active; + cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); spin_unlock_irq(&gcwq->lock); } -- cgit v1.2.2 From 7c6e72e46c9ea4a88f3f8ba96edce9db4bd48726 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Sep 2012 10:03:19 -0700 Subject: workqueue: remove spurious WARN_ON_ONCE(in_irq()) from try_to_grab_pending() e0aecdd874 ("workqueue: use irqsafe timer for delayed_work") made try_to_grab_pending() safe to use from irq context but forgot to remove WARN_ON_ONCE(in_irq()). Remove it. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu --- kernel/workqueue.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4f5c61f8b0e7..143fd8c751f4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1070,8 +1070,6 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, { struct global_cwq *gcwq; - WARN_ON_ONCE(in_irq()); - local_irq_save(*flags); /* try to steal the timer if it exists */ -- cgit v1.2.2 From a10d206ef1a83121ab7430cb196e0376a7145b22 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Sep 2012 13:55:30 -0700 Subject: rcu: Fix day-one dyntick-idle stall-warning bug Each grace period is supposed to have at least one callback waiting for that grace period to complete. However, if CONFIG_NO_HZ=n, an extra callback-free grace period is no big problem -- it will chew up a tiny bit of CPU time, but it will complete normally. In contrast, CONFIG_NO_HZ=y kernels have the potential for all the CPUs to go to sleep indefinitely, in turn indefinitely delaying completion of the callback-free grace period. Given that nothing is waiting on this grace period, this is also not a problem. That is, unless RCU CPU stall warnings are also enabled, as they are in recent kernels. In this case, if a CPU wakes up after at least one minute of inactivity, an RCU CPU stall warning will result. The reason that no one noticed until quite recently is that most systems have enough OS noise that they will never remain absolutely idle for a full minute. But there are some embedded systems with cut-down userspace configurations that consistently get into this situation. All this begs the question of exactly how a callback-free grace period gets started in the first place. This can happen due to the fact that CPUs do not necessarily agree on which grace period is in progress. If a CPU still believes that the grace period that just completed is still ongoing, it will believe that it has callbacks that need to wait for another grace period, never mind the fact that the grace period that they were waiting for just completed. This CPU can therefore erroneously decide to start a new grace period. Note that this can happen in TREE_RCU and TREE_PREEMPT_RCU even on a single-CPU system: Deadlock considerations mean that the CPU that detected the end of the grace period is not necessarily officially informed of this fact for some time. Once this CPU notices that the earlier grace period completed, it will invoke its callbacks. It then won't have any callbacks left. If no other CPU has any callbacks, we now have a callback-free grace period. This commit therefore makes CPUs check more carefully before starting a new grace period. This new check relies on an array of tail pointers into each CPU's list of callbacks. If the CPU is up to date on which grace periods have completed, it checks to see if any callbacks follow the RCU_DONE_TAIL segment, otherwise it checks to see if any callbacks follow the RCU_WAIT_TAIL segment. The reason that this works is that the RCU_WAIT_TAIL segment will be promoted to the RCU_DONE_TAIL segment as soon as the CPU is officially notified that the old grace period has ended. This change is to cpu_needs_another_gp(), which is called in a number of places. The only one that really matters is in rcu_start_gp(), where the root rcu_node structure's ->lock is held, which prevents any other CPU from starting or completing a grace period, so that the comparison that determines whether the CPU is missing the completion of a grace period is stable. Reported-by: Becky Bruce Reported-by: Subodh Nijsure Reported-by: Paul Walmsley Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Paul Walmsley # OMAP3730, OMAP4430 Cc: stable@vger.kernel.org --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..f7bcd9e6c054 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -305,7 +305,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); + return *rdp->nxttail[RCU_DONE_TAIL + + ACCESS_ONCE(rsp->completed) != rdp->completed] && + !rcu_gp_in_progress(rsp); } /* -- cgit v1.2.2 From b3dbec76e5334fbb063987dea14e7b255602d7e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Jun 2012 18:36:08 -0700 Subject: rcu: Move RCU grace-period initialization into a kthread As the first step towards allowing grace-period initialization to be preemptible, this commit moves the RCU grace-period initialization into its own kthread. This is needed to keep large-system scheduling latency at reasonable levels. Also change raw_spin_lock_irqsave() to raw_spin_lock_irq() as suggested by Peter Zijlstra in review comments. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 190 ++++++++++++++++++++++++++++++++++++------------------- kernel/rcutree.h | 3 + 2 files changed, 129 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f7bcd9e6c054..4792f1642bf2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1041,6 +1041,102 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat __note_new_gpnum(rsp, rnp, rdp); } +/* + * Body of kthread that handles grace periods. + */ +static int rcu_gp_kthread(void *arg) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = arg; + + for (;;) { + + /* Handle grace-period start. */ + rnp = rcu_get_root(rsp); + for (;;) { + wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); + if (rsp->gp_flags) + break; + flush_signals(current); + } + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags = 0; + rdp = this_cpu_ptr(rsp->rda); + + if (rcu_gp_in_progress(rsp)) { + /* + * A grace period is already in progress, so + * don't start another one. + */ + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + if (rsp->fqs_active) { + /* + * We need a grace period, but force_quiescent_state() + * is running. Tell it to start one on our behalf. + */ + rsp->fqs_need_gp = 1; + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + + /* Exclude any concurrent CPU-hotplug operations. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. + * This operation relies on the layout of the hierarchy + * within the rsp->node[] array. Note that other CPUs will + * access only the leaves of the hierarchy, which still + * indicate that no grace period is in progress, at least + * until the corresponding leaf node has been initialized. + * In addition, we have excluded CPU-hotplug operations. + * + * Note that the grace period cannot complete until + * we finish the initialization process, as there will + * be at least one qsmask bit set in the root node until + * that time, namely the one corresponding to this CPU, + * due to the fact that we have irqs disabled. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + /* force_quiescent_state() now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rsp->onofflock); + } + return 0; +} + /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1058,77 +1154,20 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_scheduler_fully_active || + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* - * Either the scheduler hasn't yet spawned the first - * non-idle task or this CPU does not need another - * grace period. Either way, don't start a new grace - * period. + * Either we have not yet spawned the grace-period + * task or this CPU does not need another grace period. + * Either way, don't start a new grace period. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - if (rsp->fqs_active) { - /* - * This CPU needs a grace period, but force_quiescent_state() - * is running. Tell it to start one on this CPU's behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Advance to a new grace period and initialize state. */ - rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - record_gp_stall_check_time(rsp); - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ - - /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - - /* - * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. This - * operation relies on the layout of the hierarchy within the - * rsp->node[] array. Note that other CPUs will access only - * the leaves of the hierarchy, which still indicate that no - * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. - * - * Note that the grace period cannot complete until we finish - * the initialization process, as there will be at least one - * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU, due to the fact that we have - * irqs disabled. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + rsp->gp_flags = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up(&rsp->gp_wq); } /* @@ -2628,6 +2667,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } +/* + * Spawn the kthread that handles this RCU flavor's grace periods. + */ +static int __init rcu_spawn_gp_kthread(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp; + struct task_struct *t; + + for_each_rcu_flavor(rsp) { + t = kthread_run(rcu_gp_kthread, rsp, rsp->name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + rsp->gp_kthread = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + return 0; +} +early_initcall(rcu_spawn_gp_kthread); + /* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain @@ -2729,6 +2790,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; + init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f2124..117a15019e99 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -385,6 +385,9 @@ struct rcu_state { u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ + struct task_struct *gp_kthread; /* Task for grace periods. */ + wait_queue_head_t gp_wq; /* Where GP task waits. */ + int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ -- cgit v1.2.2 From 79bce6724366b3827c5c673fb07d7063082873cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Sep 2012 14:32:58 -0700 Subject: rcu: Prevent initialization-time quiescent-state race The next step in reducing RCU's grace-period initialization latency on large systems will make this initialization preemptible. Unfortunately, making the grace-period initialization subject to interrupts (let alone preemption) exposes the following race on systems whose rcu_node tree contains more than one node: 1. CPU 31 starts initializing the grace period, including the first leaf rcu_node structures, and is then preempted. 2. CPU 0 refers to the first leaf rcu_node structure, and notes that a new grace period has started. It passes through a quiescent state shortly thereafter, and informs the RCU core of this rite of passage. 3. CPU 0 enters an RCU read-side critical section, acquiring a pointer to an RCU-protected data item. 4. CPU 31 takes an interrupt whose handler removes the data item referenced by CPU 0 from the data structure, and registers an RCU callback in order to free it. 5. CPU 31 resumes initializing the grace period, including its own rcu_node structure. In invokes rcu_start_gp_per_cpu(), which advances all callbacks, including the one registered in #4 above, to be handled by the current grace period. 6. The remaining CPUs pass through quiescent states and inform the RCU core, but CPU 0 remains in its RCU read-side critical section, still referencing the now-removed data item. 7. The grace period completes and all the callbacks are invoked, including the one that frees the data item that CPU 0 is still referencing. Oops!!! One way to avoid this race is to remove grace-period acceleration from rcu_start_gp_per_cpu(). Now, the only reason for this acceleration was to allow CPUs bringing RCU out of idle state to have their callbacks invoked after only one grace period, rather than the two grace periods that would otherwise be required. But this acceleration does not work when RCU grace-period initialization is moved to a kthread because the CPU posting the callback is no longer necessarily the CPU that is initializing the resulting grace period. This commit therefore removes this now-pointless (and soon to be dangerous) grace-period acceleration, thus avoiding the above race. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4792f1642bf2..e7a534498aa0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1023,20 +1023,6 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Prior grace period ended, so advance callbacks for current CPU. */ __rcu_process_gp_end(rsp, rnp, rdp); - /* - * Because this CPU just now started the new grace period, we know - * that all of its callbacks will be covered by this upcoming grace - * period, even the ones that were registered arbitrarily recently. - * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. - * - * Other CPUs cannot be sure exactly when the grace period started. - * Therefore, their recently registered callbacks must pass through - * an additional RCU_NEXT_READY stage, so that they will be handled - * by the next RCU grace period. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - /* Set state so that this CPU will detect the next quiescent state. */ __note_new_gpnum(rsp, rnp, rdp); } -- cgit v1.2.2 From 755609a9087fa983f567dc5452b2fa7b089b591f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2012 17:18:20 -0700 Subject: rcu: Allow RCU grace-period initialization to be preempted RCU grace-period initialization is currently carried out with interrupts disabled, which can result in 200-microsecond latency spikes on systems on which RCU has been configured for 4096 CPUs. This patch therefore makes the RCU grace-period initialization be preemptible, which should eliminate those latency spikes. Similar spikes from grace-period cleanup and the forcing of quiescent states will be dealt with similarly by later patches. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e7a534498aa0..781e5f0b7b17 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1030,7 +1030,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* * Body of kthread that handles grace periods. */ -static int rcu_gp_kthread(void *arg) +static int __noreturn rcu_gp_kthread(void *arg) { struct rcu_data *rdp; struct rcu_node *rnp; @@ -1056,6 +1056,7 @@ static int rcu_gp_kthread(void *arg) * don't start another one. */ raw_spin_unlock_irq(&rnp->lock); + cond_resched(); continue; } @@ -1066,6 +1067,7 @@ static int rcu_gp_kthread(void *arg) */ rsp->fqs_need_gp = 1; raw_spin_unlock_irq(&rnp->lock); + cond_resched(); continue; } @@ -1076,10 +1078,10 @@ static int rcu_gp_kthread(void *arg) rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + get_online_cpus(); /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1091,15 +1093,9 @@ static int rcu_gp_kthread(void *arg) * indicate that no grace period is in progress, at least * until the corresponding leaf node has been initialized. * In addition, we have excluded CPU-hotplug operations. - * - * Note that the grace period cannot complete until - * we finish the initialization process, as there will - * be at least one qsmask bit set in the root node until - * that time, namely the one corresponding to this CPU, - * due to the fact that we have irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; @@ -1110,17 +1106,17 @@ static int rcu_gp_kthread(void *arg) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); /* force_quiescent_state() now OK. */ rsp->fqs_state = RCU_SIGNAL_INIT; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irq(&rsp->onofflock); + raw_spin_unlock_irq(&rnp->lock); + put_online_cpus(); } - return 0; } /* -- cgit v1.2.2 From cabc49c1ff51baaf1958d501a7a616ce91245c93 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Jun 2012 17:07:14 -0700 Subject: rcu: Move RCU grace-period cleanup into kthread As a first step towards allowing grace-period cleanup to be preemptible, this commit moves the RCU grace-period cleanup into the same kthread that is now used to initialize grace periods. This is needed to keep scheduling latency down to a dull roar. [ paulmck: Get rid of stray spin_lock_irqsave() calls. ] Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 112 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 62 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 781e5f0b7b17..52c3102dc5f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1032,6 +1032,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat */ static int __noreturn rcu_gp_kthread(void *arg) { + unsigned long gp_duration; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp = arg; @@ -1116,6 +1117,65 @@ static int __noreturn rcu_gp_kthread(void *arg) rsp->fqs_state = RCU_SIGNAL_INIT; raw_spin_unlock_irq(&rnp->lock); put_online_cpus(); + + /* Handle grace-period end. */ + rnp = rcu_get_root(rsp); + for (;;) { + wait_event_interruptible(rsp->gp_wq, + !ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)); + if (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + flush_signals(current); + } + + raw_spin_lock_irq(&rnp->lock); + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; + + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. + */ + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node + * structures so that other CPUs don't have to + * wait until the start of the next grace period + * to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); + rnp->completed = rsp->gpnum; + /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); + } + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + } + + rsp->completed = rsp->gpnum; /* Declare grace period done. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + rsp->fqs_state = RCU_GP_IDLE; + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; + raw_spin_unlock_irq(&rnp->lock); } } @@ -1162,57 +1222,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - unsigned long gp_duration; - struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - - /* - * Ensure that all grace-period and pre-grace-period activity - * is seen before the assignment to rsp->completed. - */ - smp_mb(); /* See above block comment. */ - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; - - /* - * We know the grace period is complete, but to everyone else - * it appears to still be ongoing. But it is also the case - * that to everyone else it looks like there is nothing that - * they can do to advance the grace period. It is therefore - * safe for us to drop the lock in order to mark the grace - * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. - */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - - /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->gpnum; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - } - - rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* -- cgit v1.2.2 From c856bafae7f5b3f59ac1d99279a9b99b3b36ad12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 08:19:05 -0700 Subject: rcu: Allow RCU grace-period cleanup to be preempted RCU grace-period cleanup is currently carried out with interrupts disabled, which can result in excessive latency spikes on large systems (many hundreds or thousands of CPUs). This patch therefore makes the RCU grace-period cleanup be preemptible, including voluntary preemption points, which should eliminate those latency spikes. Similar spikes from forcing of quiescent states will be dealt with similarly by later patches. Updated to replace uses of spin_lock_irqsave() with spin_lock_irq(), as suggested by Peter Zijlstra. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 52c3102dc5f7..ddc6acc85d26 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1151,7 +1151,7 @@ static int __noreturn rcu_gp_kthread(void *arg) * completed. */ if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); /* * Propagate new ->completed value to rcu_node @@ -1160,14 +1160,13 @@ static int __noreturn rcu_gp_kthread(void *arg) * to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { - /* irqs already disabled. */ - raw_spin_lock(&rnp->lock); + raw_spin_lock_irq(&rnp->lock); rnp->completed = rsp->gpnum; - /* irqs remain disabled. */ - raw_spin_unlock(&rnp->lock); + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); } rsp->completed = rsp->gpnum; /* Declare grace period done. */ -- cgit v1.2.2 From 7fdefc10e1d730d4608cc59d386bc446f5b9ee99 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Jun 2012 11:08:41 -0700 Subject: rcu: Break up rcu_gp_kthread() into subfunctions Then rcu_gp_kthread() function is too large and furthermore needs to have the force_quiescent_state() code pulled in. This commit therefore breaks up rcu_gp_kthread() into rcu_gp_init() and rcu_gp_cleanup(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 250 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 135 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ddc6acc85d26..f0c0c1b4b6d4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1028,95 +1028,159 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat } /* - * Body of kthread that handles grace periods. + * Initialize a new grace period. */ -static int __noreturn rcu_gp_kthread(void *arg) +static int rcu_gp_init(struct rcu_state *rsp) { - unsigned long gp_duration; struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); - for (;;) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags = 0; - /* Handle grace-period start. */ - rnp = rcu_get_root(rsp); - for (;;) { - wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); - if (rsp->gp_flags) - break; - flush_signals(current); - } + if (rcu_gp_in_progress(rsp)) { + /* Grace period already in progress, don't start another. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; + } + + if (rsp->fqs_active) { + /* + * We need a grace period, but force_quiescent_state() + * is running. Tell it to start one on our behalf. + */ + rsp->fqs_need_gp = 1; + raw_spin_unlock_irq(&rnp->lock); + return 0; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + raw_spin_unlock_irq(&rnp->lock); + + /* Exclude any concurrent CPU-hotplug operations. */ + get_online_cpus(); + + /* + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first order, + * starting from the root rcu_node structure, relying on the layout + * of the tree within the rsp->node[] array. Note that other CPUs + * will access only the leaves of the hierarchy, thus seeing that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. + * + * The grace period cannot complete until the initialization + * process finishes, because this kthread handles both. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags = 0; rdp = this_cpu_ptr(rsp->rda); + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); + } - if (rcu_gp_in_progress(rsp)) { - /* - * A grace period is already in progress, so - * don't start another one. - */ - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - continue; - } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); + /* force_quiescent_state() now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; + raw_spin_unlock_irq(&rnp->lock); + put_online_cpus(); + return 1; +} - if (rsp->fqs_active) { - /* - * We need a grace period, but force_quiescent_state() - * is running. Tell it to start one on our behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - continue; - } +/* + * Clean up after the old grace period. + */ +static int rcu_gp_cleanup(struct rcu_state *rsp) +{ + unsigned long gp_duration; + struct rcu_data *rdp; + struct rcu_node *rnp = rcu_get_root(rsp); - /* Advance to a new grace period and initialize state. */ - rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - record_gp_stall_check_time(rsp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_lock_irq(&rnp->lock); + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; - /* Exclude any concurrent CPU-hotplug operations. */ - get_online_cpus(); + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. + */ + rdp = this_cpu_ptr(rsp->rda); + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock_irq(&rnp->lock); /* - * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. - * This operation relies on the layout of the hierarchy - * within the rsp->node[] array. Note that other CPUs will - * access only the leaves of the hierarchy, which still - * indicate that no grace period is in progress, at least - * until the corresponding leaf node has been initialized. - * In addition, we have excluded CPU-hotplug operations. + * Propagate new ->completed value to rcu_node + * structures so that other CPUs don't have to + * wait until the start of the next grace period + * to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); + rnp->completed = rsp->gpnum; raw_spin_unlock_irq(&rnp->lock); cond_resched(); } - rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - /* force_quiescent_state() now OK. */ - rsp->fqs_state = RCU_SIGNAL_INIT; - raw_spin_unlock_irq(&rnp->lock); - put_online_cpus(); + } + + rsp->completed = rsp->gpnum; /* Declare grace period done. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + rsp->fqs_state = RCU_GP_IDLE; + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; + raw_spin_unlock_irq(&rnp->lock); + return 1; +} + +/* + * Body of kthread that handles grace periods. + */ +static int __noreturn rcu_gp_kthread(void *arg) +{ + struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); + + for (;;) { + + /* Handle grace-period start. */ + for (;;) { + wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); + if (rsp->gp_flags && rcu_gp_init(rsp)) + break; + cond_resched(); + flush_signals(current); + } /* Handle grace-period end. */ rnp = rcu_get_root(rsp); @@ -1125,56 +1189,12 @@ static int __noreturn rcu_gp_kthread(void *arg) !ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)); if (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)) + !rcu_preempt_blocked_readers_cgp(rnp) && + rcu_gp_cleanup(rsp)) break; + cond_resched(); flush_signals(current); } - - raw_spin_lock_irq(&rnp->lock); - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; - - /* - * We know the grace period is complete, but to everyone else - * it appears to still be ongoing. But it is also the case - * that to everyone else it looks like there is nothing that - * they can do to advance the grace period. It is therefore - * safe for us to drop the lock in order to mark the grace - * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. - */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock_irq(&rnp->lock); - - /* - * Propagate new ->completed value to rcu_node - * structures so that other CPUs don't have to - * wait until the start of the next grace period - * to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - } - - rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; - if (cpu_needs_another_gp(rsp, rdp)) - rsp->gp_flags = 1; - raw_spin_unlock_irq(&rnp->lock); } } -- cgit v1.2.2 From bfa00b4c4028f39357d16279ff0fddf550241593 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 09:54:10 -0700 Subject: rcu: Prevent offline CPUs from executing RCU core code Earlier versions of RCU invoked the RCU core from the CPU_DYING notifier in order to note a quiescent state for the outgoing CPU. Because the CPU is marked "offline" during the execution of the CPU_DYING notifiers, the RCU core had to tolerate being invoked from an offline CPU. However, commit b1420f1c (Make rcu_barrier() less disruptive) left only tracing code in the CPU_DYING notifier, so the RCU core need no longer execute on offline CPUs. This commit therefore enforces this restriction. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f0c0c1b4b6d4..340a5f54b6af 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1892,6 +1892,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; + if (cpu_is_offline(smp_processor_id())) + return; trace_rcu_utilization("Start RCU core"); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); -- cgit v1.2.2 From b626c1b689364859ccd2e86d5e043aeadfeb2cd4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jun 2012 17:39:43 -0700 Subject: rcu: Provide OOM handler to motivate lazy RCU callbacks In kernels built with CONFIG_RCU_FAST_NO_HZ=y, CPUs can accumulate a large number of lazy callbacks, which as the name implies will be slow to be invoked. This can be a problem on small-memory systems, where the default 6-second sleep for CPUs having only lazy RCU callbacks could well be fatal. This commit therefore installs an OOM hander that ensures that every CPU with lazy callbacks has at least one non-lazy callback, in turn ensuring timely advancement for these callbacks. Updated to fix bug that disabled OOM killing, noted by Lai Jiangshan. Updated to push the for_each_rcu_flavor() loop into rcu_oom_notify_cpu(), thus reducing the number of IPIs, as suggested by Steven Rostedt. Also to make the for_each_online_cpu() loop be preemptible. (Later, it might be good to use smp_call_function(), as suggested by Peter Zijlstra.) Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Sasha Levin Reviewed-by: Josh Triplett --- kernel/rcutree.h | 5 ++- kernel/rcutree_plugin.h | 83 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 117a15019e99..effb2733b7fc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -315,8 +315,11 @@ struct rcu_data { unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() callback. */ + /* 6) _rcu_barrier() and OOM callbacks. */ struct rcu_head barrier_head; +#ifdef CONFIG_RCU_FAST_NO_HZ + struct rcu_head oom_head; +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df01..587963689328 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include +#include #define RCU_KTHREAD_PRIO 1 @@ -2112,6 +2113,88 @@ static void rcu_idle_count_callbacks_posted(void) __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } +/* + * Data for flushing lazy RCU callbacks at OOM time. + */ +static atomic_t oom_callback_count; +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); + +/* + * RCU OOM callback -- decrement the outstanding count and deliver the + * wake-up if we are the last one. + */ +static void rcu_oom_callback(struct rcu_head *rhp) +{ + if (atomic_dec_and_test(&oom_callback_count)) + wake_up(&oom_callback_wq); +} + +/* + * Post an rcu_oom_notify callback on the current CPU if it has at + * least one lazy callback. This will unnecessarily post callbacks + * to CPUs that already have a non-lazy callback at the end of their + * callback list, but this is an infrequent operation, so accept some + * extra overhead to keep things simple. + */ +static void rcu_oom_notify_cpu(void *unused) +{ + struct rcu_state *rsp; + struct rcu_data *rdp; + + for_each_rcu_flavor(rsp) { + rdp = __this_cpu_ptr(rsp->rda); + if (rdp->qlen_lazy != 0) { + atomic_inc(&oom_callback_count); + rsp->call(&rdp->oom_head, rcu_oom_callback); + } + } +} + +/* + * If low on memory, ensure that each CPU has a non-lazy callback. + * This will wake up CPUs that have only lazy callbacks, in turn + * ensuring that they free up the corresponding memory in a timely manner. + * Because an uncertain amount of memory will be freed in some uncertain + * timeframe, we do not claim to have freed anything. + */ +static int rcu_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + int cpu; + + /* Wait for callbacks from earlier instance to complete. */ + wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + + /* + * Prevent premature wakeup: ensure that all increments happen + * before there is a chance of the counter reaching zero. + */ + atomic_set(&oom_callback_count, 1); + + get_online_cpus(); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); + cond_resched(); + } + put_online_cpus(); + + /* Unconditionally decrement: no need to wake ourselves up. */ + atomic_dec(&oom_callback_count); + + return NOTIFY_OK; +} + +static struct notifier_block rcu_oom_nb = { + .notifier_call = rcu_oom_notify +}; + +static int __init rcu_register_oom_notifier(void) +{ + register_oom_notifier(&rcu_oom_nb); + return 0; +} +early_initcall(rcu_register_oom_notifier); + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO -- cgit v1.2.2 From b402b73b3afe3614bc0e921ebe18013ea103115a Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 29 Jun 2012 14:17:29 -0700 Subject: rcu: Segregate rcu_state fields to improve cache locality The fields in the rcu_state structure that are protected by the root rcu_node structure's ->lock can share a cache line with the fields protected by ->onofflock. This can result in excessive memory contention on large systems, so this commit applies ____cacheline_internodealigned_in_smp to the ->onofflock field in order to segregate them. Signed-off-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Tested-by: Dimitri Sivanich Reviewed-by: Josh Triplett --- kernel/rcutree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index effb2733b7fc..5d92b80a0a28 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -394,7 +394,8 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t onofflock; /* exclude on/offline and */ + raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; + /* exclude on/offline and */ /* starting new GP. */ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ /* need a grace period. */ -- cgit v1.2.2 From 4cdfc175c25c89eedc08460b5e6239c2ec67fcb6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Jun 2012 17:06:26 -0700 Subject: rcu: Move quiescent-state forcing into kthread As the first step towards allowing quiescent-state forcing to be preemptible, this commit moves RCU quiescent-state forcing into the same kthread that is now used to initialize and clean up after grace periods. This is yet another step towards keeping scheduling latency down to a dull roar. Updated to change from raw_spin_lock_irqsave() to raw_spin_lock_irq() and to remove the now-unused rcu_state structure fields as suggested by Peter Zijlstra. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 199 ++++++++++++++++++------------------------------ kernel/rcutree.h | 13 +--- kernel/rcutree_plugin.h | 8 +- 3 files changed, 82 insertions(+), 138 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 340a5f54b6af..6182686de4a6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -72,7 +72,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .name = #sname, \ } @@ -226,7 +225,8 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); -static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); /* @@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_bh_force_quiescent_state(void) { - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); @@ -286,7 +286,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); */ void rcu_sched_force_quiescent_state(void) { - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); @@ -784,11 +784,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) else if (!trigger_all_cpu_backtrace()) dump_stack(); - /* If so configured, complain about tasks blocking the grace period. */ + /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); - force_quiescent_state(rsp, 0); /* Kick them all. */ + force_quiescent_state(rsp); /* Kick them all. */ } static void print_cpu_stall(struct rcu_state *rsp) @@ -1036,7 +1036,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags = 0; + rsp->gp_flags = 0; /* Clear all flags: New grace period. */ if (rcu_gp_in_progress(rsp)) { /* Grace period already in progress, don't start another. */ @@ -1044,22 +1044,9 @@ static int rcu_gp_init(struct rcu_state *rsp) return 0; } - if (rsp->fqs_active) { - /* - * We need a grace period, but force_quiescent_state() - * is running. Tell it to start one on our behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irq(&rnp->lock); - return 0; - } - /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); @@ -1096,19 +1083,40 @@ static int rcu_gp_init(struct rcu_state *rsp) cond_resched(); } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - /* force_quiescent_state() now OK. */ - rsp->fqs_state = RCU_SIGNAL_INIT; - raw_spin_unlock_irq(&rnp->lock); put_online_cpus(); return 1; } +/* + * Do one round of quiescent-state forcing. + */ +int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +{ + int fqs_state = fqs_state_in; + struct rcu_node *rnp = rcu_get_root(rsp); + + rsp->n_force_qs++; + if (fqs_state == RCU_SAVE_DYNTICK) { + /* Collect dyntick-idle snapshots. */ + force_qs_rnp(rsp, dyntick_save_progress_counter); + fqs_state = RCU_FORCE_QS; + } else { + /* Handle dyntick-idle and offline CPUs. */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + } + /* Clear flag to prevent immediate re-entry. */ + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + raw_spin_unlock_irq(&rnp->lock); + } + return fqs_state; +} + /* * Clean up after the old grace period. */ -static int rcu_gp_cleanup(struct rcu_state *rsp) +static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; struct rcu_data *rdp; @@ -1160,7 +1168,6 @@ static int rcu_gp_cleanup(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); - return 1; } /* @@ -1168,6 +1175,8 @@ static int rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { + int fqs_state; + int ret; struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1175,26 +1184,43 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { - wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); - if (rsp->gp_flags && rcu_gp_init(rsp)) + wait_event_interruptible(rsp->gp_wq, + rsp->gp_flags & + RCU_GP_FLAG_INIT); + if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && + rcu_gp_init(rsp)) break; cond_resched(); flush_signals(current); } - /* Handle grace-period end. */ - rnp = rcu_get_root(rsp); + /* Handle quiescent-state forcing. */ + fqs_state = RCU_SAVE_DYNTICK; for (;;) { - wait_event_interruptible(rsp->gp_wq, - !ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)); + rsp->jiffies_force_qs = jiffies + + RCU_JIFFIES_TILL_FORCE_QS; + ret = wait_event_interruptible_timeout(rsp->gp_wq, + (rsp->gp_flags & RCU_GP_FLAG_FQS) || + (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)), + RCU_JIFFIES_TILL_FORCE_QS); + /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp) && - rcu_gp_cleanup(rsp)) + !rcu_preempt_blocked_readers_cgp(rnp)) break; - cond_resched(); - flush_signals(current); + /* If time for quiescent-state forcing, do it. */ + if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { + fqs_state = rcu_gp_fqs(rsp, fqs_state); + cond_resched(); + } else { + /* Deal with stray signal. */ + cond_resched(); + flush_signals(current); + } } + + /* Handle grace-period end. */ + rcu_gp_cleanup(rsp); } } @@ -1226,7 +1252,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) return; } - rsp->gp_flags = 1; + rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock_irqrestore(&rnp->lock, flags); wake_up(&rsp->gp_wq); } @@ -1777,72 +1803,20 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +static void force_quiescent_state(struct rcu_state *rsp) { unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - trace_rcu_utilization("Start fqs"); - if (!rcu_gp_in_progress(rsp)) { - trace_rcu_utilization("End fqs"); - return; /* No grace period in progress, nothing to force. */ - } - if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) + return; /* Someone beat us to it. */ + if (!raw_spin_trylock_irqsave(&rnp->lock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - trace_rcu_utilization("End fqs"); - return; /* Someone else is already on the job. */ - } - if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) - goto unlock_fqs_ret; /* no emergency and done recently. */ - rsp->n_force_qs++; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if(!rcu_gp_in_progress(rsp)) { - rsp->n_force_qs_ngp++; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - goto unlock_fqs_ret; /* no GP in progress, time updated. */ - } - rsp->fqs_active = 1; - switch (rsp->fqs_state) { - case RCU_GP_IDLE: - case RCU_GP_INIT: - - break; /* grace period idle or initializing, ignore. */ - - case RCU_SAVE_DYNTICK: - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - - /* Record dyntick-idle state. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - if (rcu_gp_in_progress(rsp)) - rsp->fqs_state = RCU_FORCE_QS; - break; - - case RCU_FORCE_QS: - - /* Check dyntick-idle state, send IPI to laggarts. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); - - /* Leave state in case more forcing is required. */ - - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - break; - } - rsp->fqs_active = 0; - if (rsp->fqs_need_gp) { - raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ - rsp->fqs_need_gp = 0; - rcu_start_gp(rsp, flags); /* releases rnp->lock */ - trace_rcu_utilization("End fqs"); return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -unlock_fqs_ret: - raw_spin_unlock_irqrestore(&rsp->fqslock, flags); - trace_rcu_utilization("End fqs"); + rsp->gp_flags |= RCU_GP_FLAG_FQS; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -1858,13 +1832,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* - * If an RCU GP has gone long enough, go check for dyntick - * idle CPUs and, if needed, send resched IPIs. - */ - if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); - /* * Advance callbacks in response to end of earlier grace * period that some other CPU ended. @@ -1965,12 +1932,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); + force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + } } static void @@ -2251,17 +2217,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { - - /* - * If force_quiescent_state() coming soon and this CPU - * needs a quiescent state, and this is either RCU-sched - * or RCU-bh, force a local reschedule. - */ rdp->n_rp_qs_pending++; - if (!rdp->preemptible && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, - jiffies)) - set_need_resched(); } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; @@ -2291,13 +2247,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } - /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (rcu_gp_in_progress(rsp) && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { - rdp->n_rp_need_fqs++; - return 1; - } - /* nothing to do */ rdp->n_rp_need_nothing++; return 0; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5d92b80a0a28..2d04106d1533 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -378,13 +378,6 @@ struct rcu_state { u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ - u8 fqs_active; /* force_quiescent_state() */ - /* is running. */ - u8 fqs_need_gp; /* A CPU was prevented from */ - /* starting a new grace */ - /* period because */ - /* force_quiescent_state() */ - /* was running. */ u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ @@ -413,8 +406,6 @@ struct rcu_state { struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ - raw_spinlock_t fqslock; /* Only one task forcing */ - /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -433,6 +424,10 @@ struct rcu_state { struct list_head flavors; /* List of RCU flavors. */ }; +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ +#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ + extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 587963689328..eb8dcd1bc4b5 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); */ void rcu_force_quiescent_state(void) { - force_quiescent_state(&rcu_preempt_state, 0); + force_quiescent_state(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); @@ -2076,16 +2076,16 @@ static void rcu_prepare_for_idle(int cpu) #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state, 0); + force_quiescent_state(&rcu_preempt_state); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } /* -- cgit v1.2.2 From b4be093fee0200789df59b6c90e2d099a20f55b3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Jun 2012 08:41:11 -0700 Subject: rcu: Allow RCU quiescent-state forcing to be preempted RCU quiescent-state forcing is currently carried out without preemption points, which can result in excessive latency spikes on large systems (many hundreds or thousands of CPUs). This patch therefore inserts a voluntary preemption point into force_qs_rnp(), which should greatly reduce the magnitude of these spikes. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6182686de4a6..723e2e723074 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1767,6 +1767,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { + cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); if (!rcu_gp_in_progress(rsp)) { -- cgit v1.2.2 From 4605c0143c6d611b3076025ba3a7e04293c01d69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Jun 2012 14:00:48 -0700 Subject: rcu: Adjust debugfs tracing for kthread-based quiescent-state forcing Moving quiescent-state forcing into a kthread dispenses with the need for the ->n_rp_need_fqs field, so this commit removes it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.h | 1 - kernel/rcutree_trace.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 2d04106d1533..7fb93cedc76a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -312,7 +312,6 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; - unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; /* 6) _rcu_barrier() and OOM callbacks. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94e..f54f0ceda0cf 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -386,10 +386,9 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, - rdp->n_rp_need_fqs, rdp->n_rp_need_nothing); } -- cgit v1.2.2 From 394f2769aa0dbcf027bae6fb52835e25e05d332e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Jun 2012 17:00:35 -0700 Subject: rcu: Prevent force_quiescent_state() memory contention Large systems running RCU_FAST_NO_HZ kernels see extreme memory contention on the rcu_state structure's ->fqslock field. This can be avoided by disabling RCU_FAST_NO_HZ, either at compile time or at boot time (via the nohz kernel boot parameter), but large systems will no doubt become sensitive to energy consumption. This commit therefore uses a combining-tree approach to spread the memory contention across new cache lines in the leaf rcu_node structures. This can be thought of as a tournament lock that has only a try-lock acquisition primitive. The effect on small systems is minimal, because such systems have an rcu_node "tree" consisting of a single node. In addition, this functionality is not used on fastpaths. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 47 +++++++++++++++++++++++++++++++++++++---------- kernel/rcutree.h | 1 + 2 files changed, 38 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 723e2e723074..43d57a17fcc5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -61,6 +61,7 @@ /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ @@ -1807,16 +1808,35 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) static void force_quiescent_state(struct rcu_state *rsp) { unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); + bool ret; + struct rcu_node *rnp; + struct rcu_node *rnp_old = NULL; + + /* Funnel through hierarchy to reduce memory contention. */ + rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp != NULL; rnp = rnp->parent) { + ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + !raw_spin_trylock(&rnp->fqslock); + if (rnp_old != NULL) + raw_spin_unlock(&rnp_old->fqslock); + if (ret) { + rsp->n_force_qs_lh++; + return; + } + rnp_old = rnp; + } + /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) + /* Reached the root of the rcu_node tree, acquire lock. */ + raw_spin_lock_irqsave(&rnp_old->lock, flags); + raw_spin_unlock(&rnp_old->fqslock); + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + rsp->n_force_qs_lh++; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ - if (!raw_spin_trylock_irqsave(&rnp->lock, flags)) { - rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - return; } rsp->gp_flags |= RCU_GP_FLAG_FQS; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } @@ -2704,10 +2724,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_level_0", - "rcu_node_level_1", - "rcu_node_level_2", - "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ + static char *buf[] = { "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static char *fqs[] = { "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; @@ -2732,6 +2756,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); + raw_spin_lock_init(&rnp->fqslock); + lockdep_set_class_and_name(&rnp->fqslock, + &rcu_fqs_class[i], fqs[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7fb93cedc76a..8f0293ce1517 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,7 @@ struct rcu_node { /* per-CPU kthreads as needed. */ unsigned int node_kthread_status; /* State of node_kthread_task for tracing. */ + raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* -- cgit v1.2.2 From d40011f601b450396104de42c631981502946cf0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Jun 2012 20:45:57 -0700 Subject: rcu: Control grace-period duration from sysfs Although almost everyone is well-served by the defaults, some uses of RCU benefit from shorter grace periods, while others benefit more from the greater efficiency provided by longer grace periods. Situations requiring a large number of grace periods to elapse (and wireshark startup has been called out as an example of this) are helped by lower-latency grace periods. Furthermore, in some embedded applications, people are willing to accept a small degradation in update efficiency (due to there being more of the shorter grace-period operations) in order to gain the lower latency. In contrast, those few systems with thousands of CPUs need longer grace periods because the CPU overhead of a grace period rises roughly linearly with the number of CPUs. Such systems normally do not make much use of facilities that require large numbers of grace periods to elapse, so this is a good tradeoff. Therefore, this commit allows the durations to be controlled from sysfs. There are two sysfs parameters, one named "jiffies_till_first_fqs" that specifies the delay in jiffies from the end of grace-period initialization until the first attempt to force quiescent states, and the other named "jiffies_till_next_fqs" that specifies the delay (again in jiffies) between subsequent attempts to force quiescent states. They both default to three jiffies, which is compatible with the old hard-coded behavior. At some future time, it may be possible to automatically increase the grace-period length with the number of CPUs, but we do not yet have sufficient data to do a good job. Preliminary data indicates that we should add an addiitonal jiffy to each of the delays for every 200 CPUs in the system, but more experimentation is needed. For now, the number of systems with more than 1,000 CPUs is small enough that this can be relegated to boot-time hand tuning. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 43d57a17fcc5..c0d3d56f0404 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -226,6 +226,12 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); +static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; + +module_param(jiffies_till_first_fqs, ulong, 0644); +module_param(jiffies_till_next_fqs, ulong, 0644); + static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -1177,6 +1183,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) static int __noreturn rcu_gp_kthread(void *arg) { int fqs_state; + unsigned long j; int ret; struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1197,14 +1204,18 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle quiescent-state forcing. */ fqs_state = RCU_SAVE_DYNTICK; + j = jiffies_till_first_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_first_fqs = HZ; + } for (;;) { - rsp->jiffies_force_qs = jiffies + - RCU_JIFFIES_TILL_FORCE_QS; + rsp->jiffies_force_qs = jiffies + j; ret = wait_event_interruptible_timeout(rsp->gp_wq, (rsp->gp_flags & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), - RCU_JIFFIES_TILL_FORCE_QS); + j); /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) @@ -1218,6 +1229,14 @@ static int __noreturn rcu_gp_kthread(void *arg) cond_resched(); flush_signals(current); } + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } /* Handle grace-period end. */ -- cgit v1.2.2 From 7e5c2dfb4de15e21f62c956ec32cda9372ca993b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 1 Jul 2012 15:42:33 -0700 Subject: rcu: Make rcutree module parameters visible in sysfs The module parameters blimit, qhimark, and qlomark (and more recently, rcu_fanout_leaf) have permission masks of zero, so that their values are not visible from sysfs. This is unnecessary and inconvenient to administrators who might like an easy way to see what these values are on a running system. This commit therefore sets their permission masks to 0444, allowing them to be read but not written. Reported-by: Rusty Russell Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c0d3d56f0404..f91a20c652b5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -88,7 +88,7 @@ LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; -module_param(rcu_fanout_leaf, int, 0); +module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ NUM_RCU_LVL_0, @@ -216,9 +216,9 @@ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +module_param(blimit, int, 0444); +module_param(qhimark, int, 0444); +module_param(qlowmark, int, 0444); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; -- cgit v1.2.2 From 5d4b86594984d8746b01487c768d8548463c173f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jul 2012 07:56:57 -0700 Subject: rcu: Fix day-zero grace-period initialization/cleanup race The current approach to grace-period initialization is vulnerable to extremely low-probability races. These races stem from the fact that the old grace period is marked completed on the same traversal through the rcu_node structure that is marking the start of the new grace period. This means that some rcu_node structures will believe that the old grace period is still in effect at the same time that other rcu_node structures believe that the new grace period has already started. These sorts of disagreements can result in too-short grace periods, as shown in the following scenario: 1. CPU 0 completes a grace period, but needs an additional grace period, so starts initializing one, initializing all the non-leaf rcu_node structures and the first leaf rcu_node structure. Because CPU 0 is both completing the old grace period and starting a new one, it marks the completion of the old grace period and the start of the new grace period in a single traversal of the rcu_node structures. Therefore, CPUs corresponding to the first rcu_node structure can become aware that the prior grace period has completed, but CPUs corresponding to the other rcu_node structures will see this same prior grace period as still being in progress. 2. CPU 1 passes through a quiescent state, and therefore informs the RCU core. Because its leaf rcu_node structure has already been initialized, this CPU's quiescent state is applied to the new (and only partially initialized) grace period. 3. CPU 1 enters an RCU read-side critical section and acquires a reference to data item A. Note that this CPU believes that its critical section started after the beginning of the new grace period, and therefore will not block this new grace period. 4. CPU 16 exits dyntick-idle mode. Because it was in dyntick-idle mode, other CPUs informed the RCU core of its extended quiescent state for the past several grace periods. This means that CPU 16 is not yet aware that these past grace periods have ended. Assume that CPU 16 corresponds to the second leaf rcu_node structure -- which has not yet been made aware of the new grace period. 5. CPU 16 removes data item A from its enclosing data structure and passes it to call_rcu(), which queues a callback in the RCU_NEXT_TAIL segment of the callback queue. 6. CPU 16 enters the RCU core, possibly because it has taken a scheduling-clock interrupt, or alternatively because it has more than 10,000 callbacks queued. It notes that the second most recent grace period has completed (recall that because it corresponds to the second as-yet-uninitialized rcu_node structure, it cannot yet become aware that the most recent grace period has completed), and therefore advances its callbacks. The callback for data item A is therefore in the RCU_NEXT_READY_TAIL segment of the callback queue. 7. CPU 0 completes initialization of the remaining leaf rcu_node structures for the new grace period, including the structure corresponding to CPU 16. 8. CPU 16 again enters the RCU core, again, possibly because it has taken a scheduling-clock interrupt, or alternatively because it now has more than 10,000 callbacks queued. It notes that the most recent grace period has ended, and therefore advances its callbacks. The callback for data item A is therefore in the RCU_DONE_TAIL segment of the callback queue. 9. All CPUs other than CPU 1 pass through quiescent states. Because CPU 1 already passed through its quiescent state, the new grace period completes. Note that CPU 1 is still in its RCU read-side critical section, still referencing data item A. 10. Suppose that CPU 2 wais the last CPU to pass through a quiescent state for the new grace period, and suppose further that CPU 2 did not have any callbacks queued, therefore not needing an additional grace period. CPU 2 therefore traverses all of the rcu_node structures, marking the new grace period as completed, but does not initialize a new grace period. 11. CPU 16 yet again enters the RCU core, yet again possibly because it has taken a scheduling-clock interrupt, or alternatively because it now has more than 10,000 callbacks queued. It notes that the new grace period has ended, and therefore advances its callbacks. The callback for data item A is therefore in the RCU_DONE_TAIL segment of the callback queue. This means that this callback is now considered ready to be invoked. 12. CPU 16 invokes the callback, freeing data item A while CPU 1 is still referencing it. This scenario represents a day-zero bug for TREE_RCU. This commit therefore ensures that the old grace period is marked completed in all leaf rcu_node structures before a new grace period is marked started in any of them. That said, it would have been insanely difficult to force this race to happen before the grace-period initialization process was preemptible. Therefore, this commit is not a candidate for -stable. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Conflicts: kernel/rcutree.c --- kernel/rcutree.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f91a20c652b5..145f27fe3a1f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1141,37 +1141,31 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * they can do to advance the grace period. It is therefore * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. */ - rdp = this_cpu_ptr(rsp->rda); - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq(&rnp->lock); - /* - * Propagate new ->completed value to rcu_node - * structures so that other CPUs don't have to - * wait until the start of the next grace period - * to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - } - rnp = rcu_get_root(rsp); + /* + * Propagate new ->completed value to rcu_node structures so + * that other CPUs don't have to wait until the start of the next + * grace period to process their callbacks. This also avoids + * some nasty RCU grace-period initialization races by forcing + * the end of the current grace period to be completely recorded in + * all of the rcu_node structures before the beginning of the next + * grace period is recorded in any of the rcu_node structures. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + rnp->completed = rsp->gpnum; + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; + rdp = this_cpu_ptr(rsp->rda); if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); -- cgit v1.2.2 From 661a85dc0d2ec0404e3b80909e413a9d5e42a239 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jul 2012 05:57:03 -0700 Subject: rcu: Add random PROVE_RCU_DELAY to grace-period initialization Preemption greatly raised the probability of certain types of race conditions, so this commit adds an anti-heisenbug to greatly increase the collision cross section, also known as the probability of occurrence. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 145f27fe3a1f..f0f3a18c0a20 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "rcutree.h" #include @@ -1087,6 +1088,10 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); +#ifdef CONFIG_PROVE_RCU_DELAY + if ((random32() % (rcu_num_nodes * 8)) == 0) + schedule_timeout_uninterruptible(2); +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } -- cgit v1.2.2 From 25d30cf4250f74e5ceb35f8f39739782408db633 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2012 05:23:18 -0700 Subject: rcu: Adjust for unconditional ->completed assignment Now that the rcu_node structures' ->completed fields are unconditionally assigned at grace-period cleanup time, they should already have the correct value for the new grace period at grace-period initialization time. This commit therefore inserts a WARN_ON_ONCE() to verify this invariant. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f0f3a18c0a20..a2eadd04fb29 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1080,6 +1080,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + WARN_ON_ONCE(rnp->completed != rsp->completed); rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); @@ -2777,7 +2778,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = 0; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; -- cgit v1.2.2 From bcfa57ce10d3d53d37a6e324f3010b1ce6a2784a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jul 2012 16:03:51 -0700 Subject: rcu: Eliminate signed overflow in synchronize_rcu_expedited() In the C language, signed overflow is undefined. It is true that twos-complement arithmetic normally comes to the rescue, but if the compiler can subvert this any time it has any information about the values being compared. For example, given "if (a - b > 0)", if the compiler has enough information to realize that (for example) the value of "a" is positive and that of "b" is negative, the compiler is within its rights to optimize to a simple "if (1)", which might not be what you want. This commit therefore converts synchronize_rcu_expedited()'s work-done detection counter from signed to unsigned. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index eb8dcd1bc4b5..cb5879386a02 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -677,7 +677,7 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static long sync_rcu_preempt_exp_count; +static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* @@ -792,7 +792,7 @@ void synchronize_rcu_expedited(void) unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; - long snap; + unsigned long snap; int trycount = 0; smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -811,10 +811,10 @@ void synchronize_rcu_expedited(void) synchronize_rcu(); return; } - if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) goto mb_ret; /* Others did our work for us. */ } - if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) goto unlock_mb_ret; /* Others did our work for us. */ /* force all RCU readers onto ->blkd_tasks lists. */ -- cgit v1.2.2 From 1943c89de700248d68385300a9b5588a1e314f90 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Jul 2012 17:19:25 -0700 Subject: rcu: Reduce synchronize_rcu_expedited() latency The synchronize_rcu_expedited() function disables interrupts across a scan of all leaf rcu_node structures, which is not good for real-time scheduling latency on large systems (hundreds or especially thousands of CPUs). This commit therefore holds off CPU-hotplug operations using get_online_cpus(), and removes the prior acquisiion of the ->onofflock (which required disabling interrupts). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index cb5879386a02..b4e8eb24a5f1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -799,34 +799,48 @@ void synchronize_rcu_expedited(void) snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; smp_mb(); /* Above access cannot bleed into critical section. */ + /* + * Block CPU-hotplug operations. This means that any CPU-hotplug + * operation that finds an rcu_node structure with tasks in the + * process of being boosted will know that all tasks blocking + * this expedited grace period will already be in the process of + * being boosted. This simplifies the process of moving tasks + * from leaf to root rcu_node structures. + */ + get_online_cpus(); + /* * Acquire lock, falling back to synchronize_rcu() if too many * lock-acquisition failures. Of course, if someone does the * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (ULONG_CMP_LT(snap, + ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); + goto mb_ret; /* Others did our work for us. */ + } if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { + put_online_cpus(); synchronize_rcu(); return; } - if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) - goto mb_ret; /* Others did our work for us. */ } - if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) + if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); goto unlock_mb_ret; /* Others did our work for us. */ + } /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Snapshot current state of ->blkd_tasks lists. */ @@ -835,7 +849,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + put_online_cpus(); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); -- cgit v1.2.2 From d7d6a11e8609f0319d4a2d8ede348f8b3374b652 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Aug 2012 15:00:05 -0700 Subject: rcu: Simplify quiescent-state detection The current quiescent-state detection algorithm is needlessly complex. It records the grace-period number corresponding to the quiescent state at the time of the quiescent state, which works, but it seems better to simply erase any record of previous quiescent states at the time that the CPU notices the new grace period. This has the further advantage of removing another piece of RCU for which lockless reasoning is required. Therefore, this commit makes this change. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 27 +++++++++++---------------- kernel/rcutree.h | 2 -- kernel/rcutree_plugin.h | 2 -- kernel/rcutree_trace.c | 12 +++++------- 4 files changed, 16 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a2eadd04fb29..6194402ec853 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -176,8 +176,6 @@ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -187,8 +185,6 @@ void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -899,12 +895,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - if (rnp->qsmask & rdp->grpmask) { - rdp->qs_pending = 1; - rdp->passed_quiesce = 0; - } else { - rdp->qs_pending = 0; - } + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } } @@ -984,10 +976,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * our behalf. Catch up with this state to avoid noting * spurious new grace periods. If another grace period * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. + * we will detect this later on. Of course, any quiescent + * states we found for the old GP are now invalid. */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { rdp->gpnum = rdp->completed; + rdp->passed_quiesce = 0; + } /* * If RCU does not need a quiescent state from this CPU, @@ -1358,7 +1353,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -1366,7 +1361,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { + if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum) { /* * The grace period in which this quiescent state was @@ -1425,7 +1421,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } #ifdef CONFIG_HOTPLUG_CPU @@ -2600,7 +2596,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - rdp->passed_quiesce_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8f0293ce1517..935dd4ca6816 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -246,8 +246,6 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - unsigned long passed_quiesce_gpnum; - /* gpnum at time of quiescent state. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index b4e8eb24a5f1..4734afbea73a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -137,8 +137,6 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index f54f0ceda0cf..bd4df13d4afb 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -86,12 +86,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); + rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -150,12 +149,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", + seq_printf(m, "%d,%s,%lu,%lu,%d,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); + rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -186,7 +184,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) int cpu; struct rcu_state *rsp; - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST -- cgit v1.2.2 From 4dbd6bb38dd1cbfa5cb21e56e51dffc74aa20038 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Sep 2012 21:43:57 -0700 Subject: rcu: Handle unbalanced rcu_node configurations with few CPUs If CONFIG_RCU_FANOUT_EXACT=y, if there are not enough CPUs (according to nr_cpu_ids) to require more than a single rcu_node structure, but if NR_CPUS is larger than would fit into a single rcu_node structure, then the current rcu_init_levelspread() code is subject to integer overflow in the eight-bit ->levelspread[] array in the rcu_state structure. In this case, the solution is -not- to increase the size of the elements in this array because the values in that array should be constrained to the number of bits in an unsigned long. Instead, this commit replaces NR_CPUS with nr_cpu_ids in the rcu_init_levelspread() function's initialization of the cprv local variable. This results in all of the arithmetic being consistently based off of the nr_cpu_ids value, thus avoiding the overflow, which was caused by the mixing of nr_cpu_ids and NR_CPUS. Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6194402ec853..8b9496fee235 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2723,7 +2723,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int cprv; int i; - cprv = NR_CPUS; + cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; -- cgit v1.2.2 From b17c7035f37f47c7f7cb08a5555ab2aebfa31f91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Sep 2012 15:38:02 -0700 Subject: rcu: Shrink RCU based on number of CPUs Currently, rcu_init_geometry() only reshapes RCU's combining trees if the leaf fanout is changed at boot time. This means that by default, kernels compiled with (say) NR_CPUS=4096 will keep oversized data structures, even when running on systems with (say) four CPUs. This commit therefore checks to see if the maximum number of CPUs on the actual running system (nr_cpu_ids) differs from NR_CPUS, and if so reshapes the combining trees accordingly. Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8b9496fee235..b703989148e4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2821,7 +2821,8 @@ static void __init rcu_init_geometry(void) int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + nr_cpu_ids == NR_CPUS) return; /* -- cgit v1.2.2 From ab840f7a06df780c4db01f34a5660b1e472d9ca6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jul 2012 12:30:22 -0700 Subject: rcu: Update rcutorture defaults A number of new features have been added to rcutorture over the years, but the defaults have not been updated to include them. This commit therefore turns on a couple of them that have proven helpful and trustworthy, namely periodic progress reports and testing of NO_HZ. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 25b15033c61f..53c548c39265 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett Date: Mon, 23 Jul 2012 12:05:55 -0700 Subject: rcu: Track CPU-hotplug duration statistics Many rcutorture runs include CPU-hotplug operations in their stress testing. This commit accumulates statistics on the durations of these operations in deference to the recent concern about the overhead and latency of these operations. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 53c548c39265..2736cc878ba2 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -177,8 +177,14 @@ static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static long n_offline_attempts; static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; static long n_online_attempts; static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; @@ -1215,11 +1221,13 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", - n_online_successes, - n_online_attempts, - n_offline_successes, - n_offline_attempts); + cnt += sprintf(&page[cnt], + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1491,8 +1499,10 @@ static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; + unsigned long delta; int maxcpu = -1; DEFINE_RCU_RANDOM(rand); + unsigned long starttime; VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); for_each_online_cpu(cpu) @@ -1510,6 +1520,7 @@ rcu_torture_onoff(void *arg) printk(KERN_ALERT "%s" TORTURE_FLAG "rcu_torture_onoff task: offlining %d\n", torture_type, cpu); + starttime = jiffies; n_offline_attempts++; if (cpu_down(cpu) == 0) { if (verbose) @@ -1517,12 +1528,23 @@ rcu_torture_onoff(void *arg) "rcu_torture_onoff task: offlined %d\n", torture_type, cpu); n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; } } else if (cpu_is_hotpluggable(cpu)) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "rcu_torture_onoff task: onlining %d\n", torture_type, cpu); + starttime = jiffies; n_online_attempts++; if (cpu_up(cpu) == 0) { if (verbose) @@ -1530,6 +1552,16 @@ rcu_torture_onoff(void *arg) "rcu_torture_onoff task: onlined %d\n", torture_type, cpu); n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; } } schedule_timeout_interruptible(onoff_interval * HZ); -- cgit v1.2.2 From 2caa1e4432be7260dca60c3de6949b77eb007515 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Aug 2012 16:30:45 -0700 Subject: rcu: Switch rcutorture to pr_alert() and friends Drop a few characters by switching kernel/rcutorture.c from "printk(KERN_ALERT" to "pr_alert(". Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2736cc878ba2..61be03ba598d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ - do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static char printk_buf[4096]; @@ -242,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, if (fullstop == FULLSTOP_DONTSTOP) fullstop = FULLSTOP_SHUTDOWN; else - printk(KERN_WARNING /* but going down anyway, so... */ + pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; @@ -255,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, static void rcutorture_shutdown_absorb(char *title) { if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - printk(KERN_NOTICE + pr_notice( "rcutorture thread %s parking due to system shutdown\n", title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); @@ -1276,7 +1276,7 @@ rcu_torture_stats_print(void) int cnt; cnt = rcu_torture_printk(printk_buf); - printk(KERN_ALERT "%s", printk_buf); + pr_alert("%s", printk_buf); } /* @@ -1389,20 +1389,20 @@ rcu_torture_stutter(void *arg) static inline void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) { - printk(KERN_ALERT "%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval, onoff_holdoff); + pr_alert("%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d shutdown_secs=%d " + "onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration, shutdown_secs, + onoff_interval, onoff_holdoff); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1469,9 +1469,9 @@ rcu_torture_shutdown(void *arg) !kthread_should_stop()) { delta = shutdown_time - jiffies_snap; if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); } @@ -1517,16 +1517,16 @@ rcu_torture_onoff(void *arg) cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); starttime = jiffies; n_offline_attempts++; if (cpu_down(cpu) == 0) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlined %d\n", + torture_type, cpu); n_offline_successes++; delta = jiffies - starttime; sum_offline += delta; @@ -1541,16 +1541,16 @@ rcu_torture_onoff(void *arg) } } else if (cpu_is_hotpluggable(cpu)) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); starttime = jiffies; n_online_attempts++; if (cpu_up(cpu) == 0) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlined %d\n", + torture_type, cpu); n_online_successes++; delta = jiffies - starttime; sum_online += delta; @@ -1626,14 +1626,14 @@ static int __cpuinit rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - printk(KERN_ALERT "rcu_torture_stall start.\n"); + pr_alert("rcu_torture_stall start.\n"); rcu_read_lock(); preempt_disable(); while (ULONG_CMP_LT(get_seconds(), stop_at)) continue; /* Induce RCU CPU stall warning. */ preempt_enable(); rcu_read_unlock(); - printk(KERN_ALERT "rcu_torture_stall end.\n"); + pr_alert("rcu_torture_stall end.\n"); } rcutorture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) @@ -1749,12 +1749,12 @@ static int rcu_torture_barrier_init(void) if (n_barrier_cbs == 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - printk(KERN_ALERT "%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - printk(KERN_ALERT "%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); + pr_alert("%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); return 0; } atomic_set(&barrier_cbs_count, 0); @@ -1847,7 +1847,7 @@ rcu_torture_cleanup(void) mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { - printk(KERN_WARNING /* but going down anyway, so... */ + pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); @@ -1971,17 +1971,17 @@ rcu_torture_init(void) break; } if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - printk(KERN_ALERT "rcu-torture types:"); + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - printk(KERN_ALERT " %s", torture_ops[i]->name); - printk(KERN_ALERT "\n"); + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); mutex_unlock(&fullstop_mutex); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) -- cgit v1.2.2 From 60f53782c51f27c695840ce90c6c432284319eef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Aug 2012 15:27:40 -0700 Subject: rcu: Prevent initialization race in rcutorture kthreads When you do something like "t = kthread_run(...)", it is possible that the kthread will start running before the assignment to "t" happens. If the child kthread expects to find a pointer to its task_struct in "t", it will then be fatally disappointed. This commit therefore switches such cases to kthread_create() followed by wake_up_process(), guaranteeing that the assignment happens before the child kthread starts running. Reported-by: Fengguang Wu Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 61be03ba598d..aaa7b9f3532a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -2029,14 +2029,15 @@ rcu_torture_init(void) /* Start up the kthreads. */ VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_run(rcu_torture_writer, NULL, - "rcu_torture_writer"); + writer_task = kthread_create(rcu_torture_writer, NULL, + "rcu_torture_writer"); if (IS_ERR(writer_task)) { firsterr = PTR_ERR(writer_task); VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); writer_task = NULL; goto unwind; } + wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -2151,14 +2152,15 @@ rcu_torture_init(void) } if (shutdown_secs > 0) { shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_run(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); if (IS_ERR(shutdown_task)) { firsterr = PTR_ERR(shutdown_task); VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); shutdown_task = NULL; goto unwind; } + wake_up_process(shutdown_task); } i = rcu_torture_onoff_init(); if (i != 0) { -- cgit v1.2.2 From e3ebfb96f396731ca2d0b108785d5da31b53ab00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jul 2012 14:42:01 -0700 Subject: rcu: Add PROVE_RCU_DELAY to provoke difficult races There have been some recent bugs that were triggered only when preemptible RCU's __rcu_read_unlock() was preempted just after setting ->rcu_read_lock_nesting to INT_MIN, which is a low-probability event. Therefore, reproducing those bugs (to say nothing of gaining confidence in alleged fixes) was quite difficult. This commit therefore creates a new debug-only RCU kernel config option that forces a short delay in __rcu_read_unlock() to increase the probability of those sorts of bugs occurring. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4e6a61b15e86..29ca1c6da594 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -81,6 +82,9 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; +#ifdef CONFIG_PROVE_RCU_DELAY + udelay(10); /* Make preemption more probable. */ +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); -- cgit v1.2.2 From 818615c4cde2a71a5857007b134cce89d506cc3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2012 00:24:57 -0700 Subject: rcu: Pull TINY_RCU dyntick-idle tracing into non-idle region Because TINY_RCU's idle detection keys directly off of the nesting level, rather than from a separate variable as in TREE_RCU, the TINY_RCU dyntick-idle tracing on transition to idle must happen before the change to the nesting level. This commit therefore makes this change by passing the desired new value (rather than the old value) of the nesting level in to rcu_idle_enter_common(). [ paulmck: Add fix for wrong-variable bug spotted by Michael Wang . ] Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 547b1fe5b052..e4163c5af1de 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -56,24 +56,27 @@ static void __call_rcu(struct rcu_head *head, static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(long long oldval) +static void rcu_idle_enter_common(long long newval) { - if (rcu_dynticks_nesting) { + if (newval) { RCU_TRACE(trace_rcu_dyntick("--=", - oldval, rcu_dynticks_nesting)); + rcu_dynticks_nesting, newval)); + rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - oldval, rcu_dynticks_nesting)); + rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + barrier(); + rcu_dynticks_nesting = newval; rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ } @@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval) void rcu_idle_enter(void) { unsigned long flags; - long long oldval; + long long newval; local_irq_save(flags); - oldval = rcu_dynticks_nesting; WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rcu_dynticks_nesting = 0; + newval = 0; else - rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(oldval); + newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; + rcu_idle_enter_common(newval); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -105,13 +107,12 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); void rcu_irq_exit(void) { unsigned long flags; - long long oldval; + long long newval; local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting--; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(oldval); + newval = rcu_dynticks_nesting - 1; + WARN_ON_ONCE(newval < 0); + rcu_idle_enter_common(newval); local_irq_restore(flags); } -- cgit v1.2.2 From 1e3fd2b38cea41f5386bf23440f2cbdd74cf13d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jul 2012 13:41:47 -0700 Subject: rcu: Properly initialize ->boost_tasks on CPU offline When rcu_preempt_offline_tasks() clears tasks from a leaf rcu_node structure, it does not NULL out the structure's ->boost_tasks field. This commit therefore fixes this issue. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df01..b1b485111321 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -584,8 +584,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; #ifdef CONFIG_RCU_BOOST - /* In case root is being boosted and leaf is not. */ + rnp->boost_tasks = NULL; + /* In case root is being boosted and leaf was not. */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ if (rnp_root->boost_tasks != NULL && rnp_root->boost_tasks != rnp_root->gp_tasks) @@ -593,8 +596,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ #endif /* #ifdef CONFIG_RCU_BOOST */ - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; return retval; } -- cgit v1.2.2 From b4270ee356e5ecef5394ab80c0a0301c1676b7f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jul 2012 10:12:48 -0700 Subject: rcu: Permit RCU_NONIDLE() to be used from interrupt context There is a need to use RCU from interrupt context, but either before rcu_irq_enter() is called or after rcu_irq_exit() is called. If the interrupt occurs from idle, then lockdep-RCU will complain about such uses, as they appear to be illegal uses of RCU from the idle loop. In other environments, RCU_NONIDLE() could be used to properly protect the use of RCU, but RCU_NONIDLE() currently cannot be invoked except from process context. This commit therefore modifies RCU_NONIDLE() to permit its use more globally. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e4163c5af1de..2e073a24d250 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -115,6 +115,7 @@ void rcu_irq_exit(void) rcu_idle_enter_common(newval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_irq_exit); /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ static void rcu_idle_exit_common(long long oldval) @@ -172,6 +173,7 @@ void rcu_irq_enter(void) rcu_idle_exit_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_irq_enter); #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.2 From 5cc900cf55fe58aaad93767c5a526e2a69cbcbc6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jul 2012 14:09:49 -0700 Subject: rcu: Improve boost selection when moving tasks to root rcu_node The rcu_preempt_offline_tasks() moves all tasks queued on a given leaf rcu_node structure to the root rcu_node, which is done when the last CPU corresponding the the leaf rcu_node structure goes offline. Now that RCU-preempt's synchronize_rcu_expedited() implementation blocks CPU-hotplug operations during the initialization of each rcu_node structure's ->boost_tasks pointer, rcu_preempt_offline_tasks() can do a better job of setting the root rcu_node's ->boost_tasks pointer. The key point is that rcu_preempt_offline_tasks() runs as part of the CPU-hotplug process, so that a concurrent synchronize_rcu_expedited() is guaranteed to either have not started on the one hand (in which case there is no boosting on behalf of the expedited grace period) or to be completely initialized on the other (in which case, in the absence of other priority boosting, all ->boost_tasks pointers will be initialized). Therefore, if rcu_preempt_offline_tasks() finds that the ->boost_tasks pointer is equal to the ->exp_tasks pointer, it can be sure that it is correctly placed. In the case where there was boosting ongoing at the time that the synchronize_rcu_expedited() function started, different nodes might start boosting the tasks blocking the expedited grace period at different times. In this mixed case, the root node will either be boosting tasks for the expedited grace period already, or it will start as soon as it gets done boosting for the normal grace period -- but in this latter case, the root node's tasks needed to be boosted in any case. This commit therefore adds a check of the ->boost_tasks pointer against the ->exp_tasks pointer to the list that prevents updating ->boost_tasks. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index b1b485111321..15d28febbbd4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -588,10 +588,15 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, rnp->exp_tasks = NULL; #ifdef CONFIG_RCU_BOOST rnp->boost_tasks = NULL; - /* In case root is being boosted and leaf was not. */ + /* + * In case root is being boosted and leaf was not. Make sure + * that we boost the tasks blocking the current grace period + * in this case. + */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks != rnp_root->gp_tasks && + rnp_root->boost_tasks != rnp_root->exp_tasks) rnp_root->boost_tasks = rnp_root->gp_tasks; raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ #endif /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.2 From a82dcc76021e22c174ba85d90b7a8c750b7362d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Aug 2012 14:29:20 -0700 Subject: rcu: Make offline-CPU checking allow for indefinite delays The rcu_implicit_offline_qs() function implicitly assumed that execution would progress predictably when interrupts are disabled, which is of course not guaranteed when running on a hypervisor. Furthermore, this function is short, and is called from one place only in a short function. This commit therefore ensures that the timing is checked before checking the condition, which guarantees correct behavior even given indefinite delays. It also inlines rcu_implicit_offline_qs() into rcu_implicit_dynticks_qs(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 53 +++++++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f7bcd9e6c054..2c4ee4cdbc0e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -318,35 +318,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -/* - * If the specified CPU is offline, tell the caller that it is in - * a quiescent state. Otherwise, whack it with a reschedule IPI. - * Grace periods can end up waiting on an offline CPU when that - * CPU is in the process of coming online -- it will be added to the - * rcu_node bitmasks before it actually makes it online. The same thing - * can happen while a CPU is in the process of coming online. Because this - * race is quite rare, we check for it after detecting that the grace - * period has been delayed rather than checking each and every CPU - * each and every time we start a new grace period. - */ -static int rcu_implicit_offline_qs(struct rcu_data *rdp) -{ - /* - * If the CPU is offline for more than a jiffy, it is in a quiescent - * state. We can trust its state not to change because interrupts - * are disabled. The reason for the jiffy's worth of slack is to - * handle CPUs initializing on the way up and finding their way - * to the idle loop on the way down. - */ - if (cpu_is_offline(rdp->cpu) && - ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); - rdp->offline_fqs++; - return 1; - } - return 0; -} - /* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * @@ -675,7 +646,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU. + * for this same CPU, or by virtue of having been offline. */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { @@ -699,8 +670,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Go check for the CPU being offline. */ - return rcu_implicit_offline_qs(rdp); + /* + * Check for the CPU being offline, but only if the grace period + * is old enough. We don't need to worry about the CPU changing + * state: If we see it offline even once, it has been through a + * quiescent state. + * + * The reason for insisting that the grace period be at least + * one jiffy old is that CPUs that are not quite online and that + * have just gone offline can still execute RCU read-side critical + * sections. + */ + if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) + return 0; /* Grace period is not old enough. */ + barrier(); + if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + rdp->offline_fqs++; + return 1; + } + return 0; } static int jiffies_till_stall_check(void) -- cgit v1.2.2 From b065a85354239cc96295f696eeace67ad3a55e5c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Aug 2012 15:57:54 -0700 Subject: rcu: Fix obsolete rcu_initiate_boost() header comment Commit 1217ed1b (rcu: permit rcu_read_unlock() to be called while holding runqueue locks) made rcu_initiate_boost() restore irq state when releasing the rcu_node structure's ->lock, but failed to update the header comment accordingly. This commit therefore brings the header comment up to date. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 15d28febbbd4..c47b28bf18ae 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1197,9 +1197,9 @@ static int rcu_boost_kthread(void *arg) * kthread to start boosting them. If there is an expedited grace * period in progress, it is always time to boost. * - * The caller must hold rnp->lock, which this function releases, - * but irqs remain disabled. The ->boost_kthread_task is immortal, - * so we don't need to worry about it going away. + * The caller must hold rnp->lock, which this function releases. + * The ->boost_kthread_task is immortal, so we don't need to worry + * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { -- cgit v1.2.2 From 115f7a7ca0d412aab81acaaaa95eb1ab1c622e2f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Aug 2012 13:55:03 -0700 Subject: rcu: Apply for_each_rcu_flavor() to increment_cpu_stall_ticks() The increment_cpu_stall_ticks() function listed each RCU flavor explicitly, with an ifdef to handle preemptible RCU. This commit therefore applies for_each_rcu_flavor() to save a line of code. Because this commit switches from a code-based enumeration of the flavors of RCU to an rcu_state-list-based enumeration, it is no longer possible to apply __get_cpu_var() to the per-CPU rcu_data structures. We instead use __this_cpu_var() on the rcu_state structure's ->rda field that references the corresponding rcu_data structures. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c47b28bf18ae..70b33bf780f0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2200,11 +2200,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) /* Increment ->ticks_this_gp for all flavors of RCU. */ static void increment_cpu_stall_ticks(void) { - __get_cpu_var(rcu_sched_data).ticks_this_gp++; - __get_cpu_var(rcu_bh_data).ticks_this_gp++; -#ifdef CONFIG_TREE_PREEMPT_RCU - __get_cpu_var(rcu_preempt_data).ticks_this_gp++; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + __this_cpu_ptr(rsp->rda)->ticks_this_gp++; } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ -- cgit v1.2.2 From 5fd4dc068c4ded1339180dbcd1a99e15b1c0a728 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Aug 2012 16:00:11 -0700 Subject: rcu: Avoid rcu_print_detail_task_stall_rnp() segfault The rcu_print_detail_task_stall_rnp() function invokes rcu_preempt_blocked_readers_cgp() to verify that there are some preempted RCU readers blocking the current grace period outside of the protection of the rcu_node structure's ->lock. This means that the last blocked reader might exit its RCU read-side critical section and remove itself from the ->blkd_tasks list before the ->lock is acquired, resulting in a segmentation fault when the subsequent code attempts to dereference the now-NULL gp_tasks pointer. This commit therefore moves the test under the lock. This will not have measurable effect on lock contention because this code is invoked only when printing RCU CPU stall warnings, in other words, in the common case, never. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 70b33bf780f0..df47014e129d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -422,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return; raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) -- cgit v1.2.2 From c8020a67e625c714c4dbedc8ae2944b461e204ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Aug 2012 16:55:59 -0700 Subject: rcu: Protect rcu_node accesses during CPU stall warnings The print_other_cpu_stall() function accesses a number of rcu_node fields without protection from the ->lock. In theory, this is not a problem because the fields accessed are all integers, but in practice the compiler can get nasty. Therefore, the commit extends the existing critical section to cover the entire loop body. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2c4ee4cdbc0e..2cf8eb3e2d43 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -746,14 +746,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) { + print_cpu_stall_info(rsp, + rnp->grplo + cpu); + ndetected++; + } + } raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rnp->qsmask == 0) - continue; - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, rnp->grplo + cpu); - ndetected++; - } } /* -- cgit v1.2.2 From c96ea7cfdd88d0a67c970502bc5313fede34b86b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Aug 2012 11:17:06 -0700 Subject: rcu: Avoid spurious RCU CPU stall warnings If a given CPU avoids the idle loop but also avoids starting a new RCU grace period for a full minute, RCU can issue spurious RCU CPU stall warnings. This commit fixes this issue by adding a check for ongoing grace period to avoid these spurious stall warnings. Reported-by: Becky Bruce Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2cf8eb3e2d43..98f275296c6d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -819,7 +819,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) j = ACCESS_ONCE(jiffies); js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + if (rcu_gp_in_progress(rsp) && + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); -- cgit v1.2.2 From fdab649b1aa732cd6e79654349088465cdff49af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Aug 2012 16:34:12 -0700 Subject: rcu: Remove redundant memory barrier from __call_rcu() The first memory barrier in __call_rcu() is supposed to order any updates done beforehand by the caller against the actual queuing of the callback. However, the second memory barrier (which is intended to order incrementing the queue lengths before queuing the callback) is also between the caller's updates and the queuing of the callback. The second memory barrier can therefore serve both purposes. This commit therefore removes the first memory barrier. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 98f275296c6d..ba4f4b4c2382 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1922,8 +1922,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), head->func = func; head->next = NULL; - smp_mb(); /* Ensure RCU update seen before callback registry. */ - /* * Opportunistically note grace-period endings and beginnings. * Note that we might see a beginning right after we see an -- cgit v1.2.2 From 7a11e2058f02feb6884efb067f328012c318a13f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Aug 2012 12:14:19 -0700 Subject: rcu: Move TINY_PREEMPT_RCU away from raw_local_irq_save() The use of raw_local_irq_save() is unnecessary, given that local_irq_save() really does disable interrupts. Also, it appears to interfere with lockdep. Therefore, this commit moves to local_irq_save(). Reported-by: Fengguang Wu Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Fengguang Wu --- kernel/rcutiny_plugin.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 918fd1e8509c..3d0190282204 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -278,7 +278,7 @@ static int rcu_boost(void) rcu_preempt_ctrlblk.exp_tasks == NULL) return 0; /* Nothing to boost. */ - raw_local_irq_save(flags); + local_irq_save(flags); /* * Recheck with irqs disabled: all tasks in need of boosting @@ -287,7 +287,7 @@ static int rcu_boost(void) */ if (rcu_preempt_ctrlblk.boost_tasks == NULL && rcu_preempt_ctrlblk.exp_tasks == NULL) { - raw_local_irq_restore(flags); + local_irq_restore(flags); return 0; } @@ -317,7 +317,7 @@ static int rcu_boost(void) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - raw_local_irq_restore(flags); + local_irq_restore(flags); rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -991,9 +991,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); rcp->qlen -= n; - raw_local_irq_restore(flags); + local_irq_restore(flags); } /* -- cgit v1.2.2 From 803b0ebae921714d1c36f0996db8125eda5fae53 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Aug 2012 08:34:07 -0700 Subject: time: RCU permitted to stop idle entry via softirq The can_stop_idle_tick() function complains if a softirq vector is raised too late in the idle-entry process, presumably in order to prevent dangling softirq invocations from being delayed across the full idle period, which might be indefinitely long -- and if softirq was asserted any later than the call to this function, such a delay might well happen. However, RCU needs to be able to use softirq to stop idle entry in order to be able to drain RCU callbacks from the current CPU, which in turn enables faster entry into dyntick-idle mode, which in turn reduces power consumption. Because RCU takes this action at a well-defined point in the idle-entry path, it is safe for RCU to take this approach. This commit therefore silences the error message that is sometimes produced when the going-idle CPU suddenly finds that it has an RCU_SOFTIRQ to process. The error message will continue to be issued for other softirq vectors. Reported-by: Sedat Dilek Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Sedat Dilek Reviewed-by: Josh Triplett --- kernel/time/tick-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f74..4b1785a7bb83 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -436,7 +436,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; - if (ratelimit < 10) { + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", (unsigned int) local_softirq_pending()); ratelimit++; -- cgit v1.2.2 From 22a767269a767b3ee91e4aaea353ac6bec6a912d Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 19 Sep 2012 08:52:32 -0700 Subject: rcu: Move TINY_RCU quiescent state out of extended quiescent state TINY_RCU's rcu_idle_enter_common() invokes rcu_sched_qs() in order to inform the RCU core of the quiescent state implied by idle entry. Of course, idle is also an extended quiescent state, so that the call to rcu_sched_qs() speeds up RCU's invoking of any callbacks that might be queued. This speed-up is important when entering into dyntick-idle mode -- if there are no further scheduling-clock interrupts, the callbacks might never be invoked, which could result in a system hang. However, processing callbacks does event tracing, which in turn implies RCU read-side critical sections, which are illegal in extended quiescent states. This patch therefore moves the call to rcu_sched_qs() so that it precedes the point at which we inform lockdep that RCU has entered an extended quiescent state. Signed-off-by: Li Zhong Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 2e073a24d250..e4c6a598d6f7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -75,9 +75,9 @@ static void rcu_idle_enter_common(long long newval) current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ barrier(); rcu_dynticks_nesting = newval; - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ } /* -- cgit v1.2.2 From 86f343b50bb9f56cce60fade22da9defff28934c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Sep 2012 10:41:50 -0700 Subject: rcu: Fix CONFIG_RCU_FAST_NO_HZ stall warning message The print_cpu_stall_fast_no_hz() function attempts to print -1 when the ->idle_gp_timer is not pending, but unsigned arithmetic causes it to instead print ULONG_MAX, which is 4294967295 on 32-bit systems and 18446744073709551615 on 64-bit systems. Neither of these are the most reader-friendly values, so this commit instead causes "timer not pending" to be printed when ->idle_gp_timer is not pending. Reported-by: Paul Walmsley Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index df47014e129d..e12d07ba601a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2130,11 +2130,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct timer_list *tltp = &rdtp->idle_gp_timer; + char c; - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, - rdtp->dyntick_holdoff == jiffies ? 'H' : '.', - timer_pending(tltp) ? tltp->expires - jiffies : -1); + c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; + if (timer_pending(tltp)) + sprintf(cp, "drain=%d %c timer=%lu", + rdtp->dyntick_drain, c, tltp->expires - jiffies); + else + sprintf(cp, "drain=%d %c timer not pending", + rdtp->dyntick_drain, c); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.2 From 1331e7a1bbe1f11b19c4327ba0853bee2a606543 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Aug 2012 17:43:50 -0700 Subject: rcu: Remove _rcu_barrier() dependency on __stop_machine() Currently, _rcu_barrier() relies on preempt_disable() to prevent any CPU from going offline, which in turn depends on CPU hotplug's use of __stop_machine(). This patch therefore makes _rcu_barrier() use get_online_cpus() to block CPU-hotplug operations. This has the added benefit of removing the need for _rcu_barrier() to adopt callbacks: Because CPU-hotplug operations are excluded, there can be no callbacks to adopt. This commit simplifies the code accordingly. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 83 +++++++------------------------------------------- kernel/rcutree.h | 3 -- kernel/rcutree_trace.c | 4 +-- 3 files changed, 13 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f7bcd9e6c054..c45d3f745302 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1392,17 +1392,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - /* - * If there is an rcu_barrier() operation in progress, then - * only the task doing that operation is permitted to adopt - * callbacks. To do otherwise breaks rcu_barrier() and friends - * by causing them to fail to wait for the callbacks in the - * orphanage. - */ - if (rsp->rcu_barrier_in_progress && - rsp->rcu_barrier_in_progress != current) - return; - /* Do the accounting first. */ rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; @@ -1457,9 +1446,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them, if there is no _rcu_barrier() instance running. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. + * adopting them. There can only be one CPU hotplug operation at a time, + * so no other CPU can be attempting to update rcu_cpu_kthread_task. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { @@ -1521,10 +1509,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ -} - static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -2328,13 +2312,10 @@ static void rcu_barrier_func(void *type) static void _rcu_barrier(struct rcu_state *rsp) { int cpu; - unsigned long flags; struct rcu_data *rdp; - struct rcu_data rd; unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); unsigned long snap_done; - init_rcu_head_on_stack(&rd.barrier_head); _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ @@ -2374,70 +2355,30 @@ static void _rcu_barrier(struct rcu_state *rsp) /* * Initialize the count to one rather than to zero in order to * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Also flag this task as doing - * an rcu_barrier(). This will prevent anyone else from adopting - * orphaned callbacks, which could cause otherwise failure if a - * CPU went offline and quickly came back online. To see this, - * consider the following sequence of events: - * - * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. - * 2. CPU 1 goes offline, orphaning its callbacks. - * 3. CPU 0 adopts CPU 1's orphaned callbacks. - * 4. CPU 1 comes back online. - * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. - * 6. Both rcu_barrier_callback() callbacks are invoked, awakening - * us -- but before CPU 1's orphaned callbacks are invoked!!! + * (or preemption of this task). Exclude CPU-hotplug operations + * to ensure that no offline CPU has callbacks queued. */ init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rsp->rcu_barrier_in_progress = current; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + get_online_cpus(); /* - * Force every CPU with callbacks to register a new callback - * that will tell us when all the preceding callbacks have - * been invoked. If an offline CPU has callbacks, wait for - * it to either come back online or to finish orphaning those - * callbacks. + * Force each CPU with callbacks to register a new callback. + * When that callback is invoked, we will know that all of the + * corresponding CPU's preceding callbacks have been invoked. */ - for_each_possible_cpu(cpu) { - preempt_disable(); + for_each_online_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_is_offline(cpu)) { - _rcu_barrier_trace(rsp, "Offline", cpu, - rsp->n_barrier_done); - preempt_enable(); - while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) - schedule_timeout_interruptible(1); - } else if (ACCESS_ONCE(rdp->qlen)) { + if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); - preempt_enable(); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, rsp->n_barrier_done); - preempt_enable(); } } - - /* - * Now that all online CPUs have rcu_barrier_callback() callbacks - * posted, we can adopt all of the orphaned callbacks and place - * an rcu_barrier_callback() callback after them. When that is done, - * we are guaranteed to have an rcu_barrier_callback() callback - * following every callback that could possibly have been - * registered before _rcu_barrier() was called. - */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rcu_adopt_orphan_cbs(rsp); - rsp->rcu_barrier_in_progress = NULL; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rsp->barrier_cpu_count); - smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rd.rsp = rsp; - rsp->call(&rd.barrier_head, rcu_barrier_callback); + put_online_cpus(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2458,8 +2399,6 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); - - destroy_rcu_head_on_stack(&rd.barrier_head); } /** diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f2124..94dfdf1f31f5 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -398,9 +398,6 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ - struct task_struct *rcu_barrier_in_progress; - /* Task doing rcu_barrier(), */ - /* or NULL if no barrier. */ struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94e..6a2e52a85d77 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", - rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', + seq_printf(m, "%s: bcc: %d nbd: %lu\n", + rsp->name, atomic_read(&rsp->barrier_cpu_count), rsp->n_barrier_done); return 0; -- cgit v1.2.2 From 0d8ee37e2fcb7b77b9c5dee784beca5a215cad4c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Aug 2012 13:16:15 -0700 Subject: rcu: Disallow callback registry on offline CPUs Posting a callback after the CPU_DEAD notifier effectively leaks that callback unless/until that CPU comes back online. Silence is unhelpful when attempting to track down such leaks, so this commit emits a WARN_ON_ONCE() and unconditionally leaks the callback when an offline CPU attempts to register a callback. The rdp->nxttail[RCU_NEXT_TAIL] is set to NULL in the CPU_DEAD notifier and restored in the CPU_UP_PREPARE notifier, allowing _call_rcu() to determine exactly when posting callbacks is illegal. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c45d3f745302..be76c80a14d1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1505,6 +1505,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); + init_callback_list(rdp); + /* Disallow further callbacks on this CPU. */ + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1927,6 +1930,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { + /* _call_rcu() is illegal on offline CPU; leak the callback. */ + WARN_ON_ONCE(1); + local_irq_restore(flags); + return; + } ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; @@ -2464,6 +2473,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); -- cgit v1.2.2 From 5d18023294abc22984886bd7185344e0c2be0daf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Aug 2012 11:26:57 +0200 Subject: sched: Fix load avg vs cpu-hotplug Rabik and Paul reported two different issues related to the same few lines of code. Rabik's issue is that the nr_uninterruptible migration code is wrong in that he sees artifacts due to this (Rabik please do expand in more detail). Paul's issue is that this code as it stands relies on us using stop_machine() for unplug, we all would like to remove this assumption so that eventually we can remove this stop_machine() usage altogether. The only reason we'd have to migrate nr_uninterruptible is so that we could use for_each_online_cpu() loops in favour of for_each_possible_cpu() loops, however since nr_uninterruptible() is the only such loop and its using possible lets not bother at all. The problem Rabik sees is (probably) caused by the fact that by migrating nr_uninterruptible we screw rq->calc_load_active for both rqs involved. So don't bother with fancy migration schemes (meaning we now have to keep using for_each_possible_cpu()) and instead fold any nr_active delta after we migrate all tasks away to make sure we don't have any skewed nr_active accounting. [ paulmck: Move call to calc_load_migration to CPU_DEAD to avoid miscounting noted by Rakib. ] Reported-by: Rakib Mullick Reported-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc6..8c38b5e7ce47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5304,27 +5304,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5617,9 +5607,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + case CPU_DEAD: + { + struct rq *dest_rq; + + local_irq_save(flags); + dest_rq = cpu_rq(smp_processor_id()); + raw_spin_lock(&dest_rq->lock); + calc_load_migrate(rq); + raw_spin_unlock_irqrestore(&dest_rq->lock, flags); + } break; #endif } -- cgit v1.2.2 From 59a93c27c4892f04dfd8f91f8b64d0d6eae43e6e Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Thu, 9 Aug 2012 00:37:27 -0700 Subject: alarmtimer: Implement minimum alarm interval for allowing suspend alarmtimer suspend return -EBUSY if the next alarm will fire in less than 2 seconds. This allows one RTC seconds tick to occur subsequent to this check before the alarm wakeup time is set, ensuring the wakeup time is still in the future (assuming the RTC does not tick one more second prior to setting the alarm). If suspend is rejected due to an imminent alarm, hold a wakeup source for 2 seconds to process the alarm prior to reattempting suspend. If setting the alarm incurs an -ETIME for an alarm set in the past, or any other problem setting the alarm, abort suspend and hold a wakelock for 1 second while the alarm is allowed to be serviced or other hopefully transient conditions preventing the alarm clear up. Signed-off-by: Todd Poynor Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc8..54e7145c5414 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -46,6 +46,8 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; @@ -250,6 +252,7 @@ static int alarmtimer_suspend(struct device *dev) unsigned long flags; struct rtc_device *rtc; int i; + int ret; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; @@ -279,8 +282,10 @@ static int alarmtimer_suspend(struct device *dev) if (min.tv64 == 0) return 0; - /* XXX - Should we enforce a minimum sleep time? */ - WARN_ON(min.tv64 < NSEC_PER_SEC); + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +293,11 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); - rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - - return 0; + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; } #else static int alarmtimer_suspend(struct device *dev) @@ -821,6 +828,7 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } + ws = wakeup_source_register("alarmtimer"); return 0; out_drv: -- cgit v1.2.2 From dae373be9fec6f850159a05af3a1c36236a70d43 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 13 Sep 2012 19:12:16 -0400 Subject: alarmtimer: Use hrtimer per-alarm instead of per-base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arve HjønnevÃ¥g reported numerous crashes from the "BUG_ON(timer->state != HRTIMER_STATE_CALLBACK)" check in __run_hrtimer after it called alarmtimer_fired. It ends up the alarmtimer code was not properly handling possible failures of hrtimer_try_to_cancel, and because these faulres occur when the underlying base hrtimer is being run, this limits the ability to properly handle modifications to any alarmtimers on that base. Because much of the logic duplicates the hrtimer logic, it seems that we might as well have a per-alarmtimer hrtimer, and avoid the extra complextity of trying to multiplex many alarmtimers off of one hrtimer. Thus this patch moves the hrtimer to the alarm structure and simplifies the management logic. Changelog: v2: * Includes a fix for double alarm_start calls found by Arve Cc: Arve HjønnevÃ¥g Cc: Colin Cross Cc: Thomas Gleixner Reported-by: Arve HjønnevÃ¥g Tested-by: Arve HjønnevÃ¥g Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 94 ++++++++++++++---------------------------------- 1 file changed, 27 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 54e7145c5414..b1560ebe759f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,7 +37,6 @@ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - struct hrtimer timer; ktime_t (*gettime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -132,21 +131,17 @@ static inline void alarmtimer_rtc_timer_init(void) { } * @base: pointer to the base where the timer is being run * @alarm: pointer to alarm being enqueued. * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. + * Adds alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + timerqueue_add(&base->timerqueue, &alarm->node); alarm->state |= ALARMTIMER_STATE_ENQUEUED; - - if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { - hrtimer_try_to_cancel(&base->timer); - hrtimer_start(&base->timer, alarm->node.expires, - HRTIMER_MODE_ABS); - } } /** @@ -154,28 +149,17 @@ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) * @base: pointer to the base where the timer is running * @alarm: pointer to alarm being removed * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. + * Removes alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) { - struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); - if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) return; timerqueue_del(&base->timerqueue, &alarm->node); alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - if (next == &alarm->node) { - hrtimer_try_to_cancel(&base->timer); - next = timerqueue_getnext(&base->timerqueue); - if (!next) - return; - hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); - } } @@ -190,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { - struct alarm_base *base = container_of(timer, struct alarm_base, timer); - struct timerqueue_node *next; + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - ktime_t now; int ret = HRTIMER_NORESTART; int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); - now = base->gettime(); - while ((next = timerqueue_getnext(&base->timerqueue))) { - struct alarm *alarm; - ktime_t expired = next->expires; - - if (expired.tv64 > now.tv64) - break; - - alarm = container_of(next, struct alarm, node); - - timerqueue_del(&base->timerqueue, &alarm->node); - alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - alarm->state |= ALARMTIMER_STATE_CALLBACK; - spin_unlock_irqrestore(&base->lock, flags); - if (alarm->function) - restart = alarm->function(alarm, now); - spin_lock_irqsave(&base->lock, flags); - alarm->state &= ~ALARMTIMER_STATE_CALLBACK; + alarmtimer_remove(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->state |= ALARMTIMER_STATE_ENQUEUED; - } - } + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); - if (next) { - hrtimer_set_expires(&base->timer, next->expires); + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); ret = HRTIMER_RESTART; } spin_unlock_irqrestore(&base->lock, flags); @@ -331,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -341,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @alarm: ptr to alarm to set * @start: time to run the alarm */ -void alarm_start(struct alarm *alarm, ktime_t start) +int alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; + int ret; spin_lock_irqsave(&base->lock, flags); - if (alarmtimer_active(alarm)) - alarmtimer_remove(base, alarm); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); + ret = hrtimer_start(&alarm->timer, alarm->node.expires, + HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + return ret; } /** @@ -365,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret = -1; - spin_lock_irqsave(&base->lock, flags); - - if (alarmtimer_callback_running(alarm)) - goto out; + int ret; - if (alarmtimer_is_queued(alarm)) { + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) alarmtimer_remove(base, alarm); - ret = 1; - } else - ret = 0; -out: spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -809,10 +773,6 @@ static int __init alarmtimer_init(void) for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); - hrtimer_init(&alarm_bases[i].timer, - alarm_bases[i].base_clockid, - HRTIMER_MODE_ABS); - alarm_bases[i].timer.function = alarmtimer_fired; } error = alarmtimer_rtc_interface_setup(); -- cgit v1.2.2 From a65bcc12ad74b3efe78847945a1e36cfcbcbc4e6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 13 Sep 2012 19:25:22 -0400 Subject: alarmtimer: Rename alarmtimer_remove to alarmtimer_dequeue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that alarmtimer_remove has been simplified, change its name to _dequeue to better match its paired _enqueue function. Cc: Arve HjønnevÃ¥g Cc: Colin Cross Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b1560ebe759f..f11d83b12949 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -145,7 +145,7 @@ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) } /** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue * @base: pointer to the base where the timer is running * @alarm: pointer to alarm being removed * @@ -153,7 +153,7 @@ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) * * Must hold base->lock when calling. */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) { if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) return; @@ -181,7 +181,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); - alarmtimer_remove(base, alarm); + alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) @@ -340,7 +340,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_lock_irqsave(&base->lock, flags); ret = hrtimer_try_to_cancel(&alarm->timer); if (ret >= 0) - alarmtimer_remove(base, alarm); + alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); return ret; } -- cgit v1.2.2 From b3c869d35b9b014f63ac0beacd31c57372084d01 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Sep 2012 12:42:27 -0400 Subject: jiffies: Remove compile time assumptions about CLOCK_TICK_RATE CLOCK_TICK_RATE is used to accurately caclulate exactly how a tick will be at a given HZ. This is useful, because while we'd expect NSEC_PER_SEC/HZ, the underlying hardware will have some granularity limit, so we won't be able to have exactly HZ ticks per second. This slight error can cause timekeeping quality problems when using the jiffies or other jiffies driven clocksources. Thus we currently use compile time CLOCK_TICK_RATE value to generate SHIFTED_HZ and NSEC_PER_JIFFIES, which we then use to adjust the jiffies clocksource to correct this error. Unfortunately though, since CLOCK_TICK_RATE is a compile time value, and the jiffies clocksource is registered very early during boot, there are a number of cases where there are different possible hardware timers that have different tick rates. This causes problems in cases like ARM where there are numerous different types of hardware, each having their own compile-time CLOCK_TICK_RATE, making it hard to accurately support different hardware with a single kernel. For the most part, this doesn't matter all that much, as not too many systems actually utilize the jiffies or jiffies driven clocksource. Usually there are other highres clocksources who's granularity error is negligable. Even so, we have some complicated calcualtions that we do everywhere to handle these edge cases. This patch removes the compile time SHIFTED_HZ value, and introduces a register_refined_jiffies() function. This results in the default jiffies clock as being assumed a perfect HZ freq, and allows archtectures that care about jiffies accuracy to call register_refined_jiffies() with the tick rate, specified dynamically at boot. This allows us, where necessary, to not have a compile time CLOCK_TICK_RATE constant, simplifies the jiffies code, and still provides a way to have an accurate jiffies clock. NOTE: Since this patch does not add register_refinied_jiffies() calls for every arch, it may cause time quality regressions in some cases. Its likely these will not be noticable, but if they are an issue, adding the following to the end of setup_arch() should resolve the regression: register_refinied_jiffies(CLOCK_TICK_RATE) Cc: Catalin Marinas Cc: Arnd Bergmann Cc: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/jiffies.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10b..6629bf7b5285 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) +#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However @@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void) { return &clocksource_jiffies; } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + clocksource_register(&refined_jiffies); + return 0; +} -- cgit v1.2.2 From d7b4202e0581683f1a14fe598633da0067f5241e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Sep 2012 15:12:07 -0400 Subject: time: Move timekeeper structure to timekeeper_internal.h for vsyscall changes We're going to need to access the timekeeper in update_vsyscall, so make the structure available for those who need it. Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Martin Schwidefsky Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 56 +---------------------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3b91e75cecd..02c19d3d8e0d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,6 +8,7 @@ * */ +#include #include #include #include @@ -21,61 +22,6 @@ #include #include -/* Structure holding internal timekeeping values. */ -struct timekeeper { - /* Current clocksource used for timekeeping. */ - struct clocksource *clock; - /* NTP adjusted clock multiplier */ - u32 mult; - /* The shift value of the current clocksource. */ - u32 shift; - /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; - /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; - /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; - /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; - - /* Current CLOCK_REALTIME time in seconds */ - u64 xtime_sec; - /* Clock shifted nano seconds */ - u64 xtime_nsec; - - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ - s64 ntp_error; - /* Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. */ - u32 ntp_error_shift; - - /* - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the - * monotonic time not to jump. We need to add total_sleep_time to - * wall_to_monotonic to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ - struct timespec wall_to_monotonic; - /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - /* Seqlock for all timekeeper values */ - seqlock_t lock; -}; static struct timekeeper timekeeper; -- cgit v1.2.2 From 189374aed657e2228ad6b39ece438c9cdafc8dae Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Sep 2012 15:27:48 -0400 Subject: time: Move update_vsyscall definitions to timekeeper_internal.h Since users will need to include timekeeper_internal.h, move update_vsyscall definitions to timekeeper_internal.h. Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Martin Schwidefsky Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index ba744cf80696..d226c6a3fd28 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.2 From 706394211648117762edfaeffd6fc04bf3b1a75d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Sep 2012 15:34:21 -0400 Subject: time: Convert CONFIG_GENERIC_TIME_VSYSCALL to CONFIG_GENERIC_TIME_VSYSCALL_OLD To help migrate archtectures over to the new update_vsyscall method, redfine CONFIG_GENERIC_TIME_VSYSCALL as CONFIG_GENERIC_TIME_VSYSCALL_OLD Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Martin Schwidefsky Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/Kconfig | 2 +- kernel/time/timekeeping.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b75..489c86154d1f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -13,7 +13,7 @@ config ARCH_CLOCKSOURCE_DATA bool # Timekeeping vsyscall support -config GENERIC_TIME_VSYSCALL +config GENERIC_TIME_VSYSCALL_OLD bool # ktime_t scalar 64bit nsec representation diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c19d3d8e0d..7c2851384c46 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -199,7 +199,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) ntp_clear(); } xt = tk_xtime(tk); - update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); + update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } /** -- cgit v1.2.2 From 576094b7f0aaf41aadab9b7d4e5bd85faa432711 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 19:58:13 -0400 Subject: time: Introduce new GENERIC_TIME_VSYSCALL Now that we moved everyone over to GENERIC_TIME_VSYSCALL_OLD, introduce the new declaration and config option for the new update_vsyscall method. Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Martin Schwidefsky Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/Kconfig | 4 ++++ kernel/time/timekeeping.c | 14 +------------- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 489c86154d1f..8601f0db1261 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,10 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL_OLD bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7c2851384c46..ce618010c373 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -42,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } -static struct timespec tk_xtime(struct timekeeper *tk) -{ - struct timespec ts; - - ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - return ts; -} - static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; @@ -192,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { - struct timespec xt; - if (clearntp) { tk->ntp_error = 0; ntp_clear(); } - xt = tk_xtime(tk); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); + update_vsyscall(tk); } /** -- cgit v1.2.2 From 92bb1fcf57a0c2e45f7e67fbf0a8ed475a749236 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Sep 2012 15:38:12 -0400 Subject: time: Only do nanosecond rounding on GENERIC_TIME_VSYSCALL_OLD systems We only do rounding to the next nanosecond so we don't see minor 1ns inconsistencies in the vsyscall implementations. Since we're changing the vsyscall implementations to avoid this, conditionalize the rounding only to the GENERIC_TIME_VSYSCALL_OLD architectures. Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Martin Schwidefsky Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ce618010c373..16280ff3cf82 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1062,6 +1062,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + +} +#else +#define old_vsyscall_fixup(tk) +#endif + + + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1073,7 +1100,6 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; - s64 remainder; write_seqlock_irqsave(&tk->lock, flags); @@ -1115,20 +1141,11 @@ static void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(tk, offset); - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; + * XXX This can be killed once everyone converts + * to the new update_vsyscall. + */ + old_vsyscall_fixup(tk); /* * Finally, make sure that after the rounding -- cgit v1.2.2 From 5224c3a31549f1c056039545b289e1b01ed02f12 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 7 Sep 2012 18:12:19 -0700 Subject: tracing: Add an option for disabling markers In our application, we have trace markers spread through user-space. We have markers in GL, X, etc. These are super handy for Chrome's about:tracing feature (Chrome + system + kernel trace view), but can be very distracting when you're trying to debug a kernel issue. I normally, use "grep -v tracing_mark_write" but it would be nice if I could just temporarily disable markers all together. Link: http://lkml.kernel.org/r/1347066739-26285-1-git-send-email-msb@chromium.org CC: Frederic Weisbecker Signed-off-by: Mandeep Singh Baines Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++++- kernel/trace/trace.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 08acf42e325b..1ec5c1dab629 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -328,7 +328,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO; + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -470,6 +470,7 @@ static const char *trace_options[] = { "overwrite", "disable_on_free", "irq-info", + "markers", NULL }; @@ -3886,6 +3887,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; + if (!(trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 593debefc4e9..63a2da0b9a6e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,7 @@ enum trace_iterator_flags { TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, + TRACE_ITER_MARKERS = 0x1000000, }; /* -- cgit v1.2.2 From 8781915ad2716adcd8cd5cc52cee791fc8b00fdf Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 12 Sep 2012 11:47:57 -0300 Subject: trace: Move trace event enable from fs_initcall to core_initcall This patch splits trace event initialization in two stages: * ftrace enable * sysfs event entry creation This allows to capture trace events from an earlier point by using 'trace_event' kernel parameter and is important to trace boot-up allocations. Note that, in order to enable events at core_initcall, it's necessary to move init_ftrace_syscalls() from core_initcall to early_initcall. Link: http://lkml.kernel.org/r/1347461277-25302-1-git-send-email-elezegarcia@gmail.com Signed-off-by: Ezequiel Garcia Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 108 ++++++++++++++++++++++++++++-------------- kernel/trace/trace_syscalls.c | 2 +- 2 files changed, 73 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbb0e63d78e9..d608d09d08c0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } +static void event_remove(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ + int ret = 0; + + if (WARN_ON(!call->name)) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", + call->name); + } + + return ret; +} + static int __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, const struct file_operations *id, @@ -1209,19 +1234,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, struct dentry *d_events; int ret; - /* The linker may leave blanks */ - if (!call->name) - return -EINVAL; - - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace events/%s\n", - call->name); - return ret; - } - } + ret = event_init(call); + if (ret < 0) + return ret; d_events = event_trace_events_dir(); if (!d_events) @@ -1272,13 +1287,10 @@ static void remove_subsystem_dir(const char *name) */ static void __trace_remove_event_call(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); - if (call->event.funcs) - __unregister_ftrace_event(&call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); + event_remove(call); trace_destroy_fields(call); destroy_preds(call); + debugfs_remove_recursive(call->dir); remove_subsystem_dir(call->class->system); } @@ -1450,15 +1462,43 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +static __init int event_trace_enable(void) +{ + struct ftrace_event_call **iter, *call; + char *buf = bootup_event_buf; + char *token; + int ret; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); + } + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } + return 0; +} + static __init int event_trace_init(void) { - struct ftrace_event_call **call; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; int ret; - char *buf = bootup_event_buf; - char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1497,24 +1537,19 @@ static __init int event_trace_init(void) if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, + /* + * Early initialization already enabled ftrace event. + * Now it's only necessary to create the event directory. + */ + list_for_each_entry(call, &ftrace_events, list) { + + ret = event_create_dir(call, d_events, + &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - } - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(token, 1); - if (ret) - pr_warning("Failed to enable trace event: %s\n", token); + if (ret < 0) + event_remove(call); } ret = register_module_notifier(&trace_module_nb); @@ -1523,6 +1558,7 @@ static __init int event_trace_init(void) return 0; } +core_initcall(event_trace_enable); fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6b245f64c8dd..2485a7d09b11 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -487,7 +487,7 @@ int __init init_ftrace_syscalls(void) return 0; } -core_initcall(init_ftrace_syscalls); +early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.2 From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Sep 2012 23:04:42 +0000 Subject: net: use a per task frag allocator We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Vijay Subramanian Cc: Alexander Duyck Tested-by: Vijay Subramanian Signed-off-by: David S. Miller --- kernel/exit.c | 3 +++ kernel/fork.c | 1 + 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5bb..42f25952edd9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1046,6 +1046,9 @@ void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + if (tsk->task_frag.page) + put_page(tsk->task_frag.page); + validate_creds_for_do_exit(tsk); preempt_disable(); diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..01565b9ce0f3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + tsk->task_frag.page = NULL; account_kernel_stack(ti, 1); -- cgit v1.2.2 From bf9fae9f5e4ca8dce4708812f9ad6281e61df109 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Sep 2012 15:23:11 +0200 Subject: cputime: Use a proper subsystem naming for vtime related APIs Use a naming based on vtime as a prefix for virtual based cputime accounting APIs: - account_system_vtime() -> vtime_account() - account_switch_vtime() -> vtime_task_switch() It makes it easier to allow for further declension such as vtime_account_system(), vtime_account_idle(), ... if we want to find out the context we account to from generic code. This also make it better to know on which subsystem these APIs refer to. Signed-off-by: Frederic Weisbecker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 8 ++++---- kernel/softirq.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ba144b121f3d..21e4dcff18f3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1764,7 +1764,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul */ prev_state = prev->state; - account_switch_vtime(prev); + vtime_task_switch(prev); finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 372692bd5376..53f5b12f2821 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -10,11 +10,11 @@ /* * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU + * They are only modified in vtime_account, on corresponding CPU * with interrupts disabled. So, writes are safe. * They are read and saved off onto struct rq in update_rq_clock(). * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old + * race with irq/vtime_account on this CPU. We would either get old * or new value with a side effect of accounting a slice of irq time to wrong * task when irq is in progress while we read rq->clock. That is a worthy * compromise in place of having locks on each irq in account_system_time. @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void account_system_vtime(struct task_struct *curr) +void vtime_account(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void account_system_vtime(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); static int irqtime_account_hi_update(void) { diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09e..d55e3159f928 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -220,7 +220,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_system_vtime(current); + vtime_account(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -271,7 +271,7 @@ restart: lockdep_softirq_exit(); - account_system_vtime(current); + vtime_account(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -340,7 +340,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - account_system_vtime(current); + vtime_account(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) -- cgit v1.2.2 From a7e1a9e3af71b45ecae2dae35851f238117b317d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Sep 2012 16:14:02 +0200 Subject: vtime: Consolidate system/idle context detection Move the code that finds out to which context we account the cputime into generic layer. Archs that consider the whole time spent in the idle task as idle time (ia64, powerpc) can rely on the generic vtime_account() and implement vtime_account_system() and vtime_account_idle(), letting the generic code to decide when to call which API. Archs that have their own meaning of idle time, such as s390 that only considers the time spent in CPU low power mode as idle time, can just override vtime_account(). Signed-off-by: Frederic Weisbecker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- kernel/sched/cputime.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 53f5b12f2821..81b763ba58a6 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -432,6 +432,32 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = cputime.utime; *st = cputime.stime; } + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + + if (in_interrupt() || !is_idle_task(tsk)) + vtime_account_system(tsk); + else + vtime_account_idle(tsk); + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + #else #ifndef nsecs_to_cputime -- cgit v1.2.2 From adf5091e6ccaa02905e7a28f9ff44f46c7f4c230 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jun 2012 11:20:21 -0700 Subject: rcu: New rcu_user_enter() and rcu_user_exit() APIs RCU currently insists that only idle tasks can enter RCU idle mode, which prohibits an adaptive tickless kernel (AKA nohz cpusets), which in turn would mean that usermode execution would always take scheduling-clock interrupts, even when there is only one task runnable on the CPU in question. This commit therefore adds rcu_user_enter() and rcu_user_exit(), which allow non-idle tasks to enter RCU idle mode. These are quite similar to rcu_idle_enter() and rcu_idle_exit(), respectively, except that they omit the idle-task checks. [ Updated to use "user" flag rather than separate check functions. ] [ paulmck: Updated to drop exports of new functions based on Josh's patch getting rid of the need for them. ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Reviewed-by: Josh Triplett --- kernel/rcutree.c | 135 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 101 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7387e46009d9..af0dc3472a4b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -322,16 +322,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * * If the new value of the ->dynticks_nesting counter now is zero, * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, + bool user) { trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current)) { + if (!is_idle_task(current) && !user) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); @@ -348,7 +349,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* - * The idle task is not permitted to enter the idle loop while + * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), @@ -359,19 +360,11 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) "Illegal idle entry in RCU-sched read-side critical section."); } -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. */ -void rcu_idle_enter(void) +static void rcu_eqs_enter(bool user) { unsigned long flags; long long oldval; @@ -385,11 +378,53 @@ void rcu_idle_enter(void) rdtp->dynticks_nesting = 0; else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(rdtp, oldval); + rcu_eqs_enter_common(rdtp, oldval, user); local_irq_restore(flags); } + +/** + * rcu_idle_enter - inform RCU that current CPU is entering idle + * + * Enter idle mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. + */ +void rcu_idle_enter(void) +{ + rcu_eqs_enter(0); +} EXPORT_SYMBOL_GPL(rcu_idle_enter); +/** + * rcu_user_enter - inform RCU that we are resuming userspace. + * + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_user_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ +void rcu_user_enter(void) +{ + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + rcu_eqs_enter(1); +} + + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -420,18 +455,19 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); else - rcu_idle_enter_common(rdtp, oldval); + rcu_eqs_enter_common(rdtp, oldval, 1); local_irq_restore(flags); } /* - * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * rcu_eqs_exit_common - current CPU moving away from extended quiescent state * * If the new value of the ->dynticks_nesting counter was previously zero, * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, + int user) { smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); @@ -440,7 +476,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current)) { + if (!is_idle_task(current) && !user) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", @@ -452,18 +488,11 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) } } -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. */ -void rcu_idle_exit(void) +static void rcu_eqs_exit(bool user) { unsigned long flags; struct rcu_dynticks *rdtp; @@ -477,11 +506,49 @@ void rcu_idle_exit(void) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; else rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(rdtp, oldval); + rcu_eqs_exit_common(rdtp, oldval, user); local_irq_restore(flags); } + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to + * allow for the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + rcu_eqs_exit(0); +} EXPORT_SYMBOL_GPL(rcu_idle_exit); +/** + * rcu_user_exit - inform RCU that we are exiting userspace. + * + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. + */ +void rcu_user_exit(void) +{ + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + rcu_eqs_exit(1); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -515,7 +582,7 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); else - rcu_idle_exit_common(rdtp, oldval); + rcu_eqs_exit_common(rdtp, oldval, 1); local_irq_restore(flags); } -- cgit v1.2.2 From 19dd1591fc379f1d89f39cd99cbbe97433baa3c3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 4 Jun 2012 16:42:35 -0700 Subject: rcu: New rcu_user_enter_after_irq() and rcu_user_exit_after_irq() APIs In some cases, it is necessary to enter or exit userspace-RCU-idle mode from an interrupt handler, for example, if some other CPU sends this CPU a resched IPI. In this case, the current CPU would enter the IPI handler in userspace-RCU-idle mode, but would need to exit the IPI handler after having exited that mode. To allow this to work, this commit adds two new APIs to TREE_RCU: - rcu_user_enter_after_irq(). This must be called from an interrupt between rcu_irq_enter() and rcu_irq_exit(). After the irq calls rcu_irq_exit(), the irq handler will return into an RCU extended quiescent state. In theory, this interrupt is never a nested interrupt, but in practice it might interrupt softirq, which looks to RCU like a nested interrupt. - rcu_user_exit_after_irq(). This must be called from a non-nesting interrupt, interrupting an RCU extended quiescent state, also between rcu_irq_enter() and rcu_irq_exit(). After the irq calls rcu_irq_exit(), the irq handler will return in an RCU non-quiescent state. [ Combined with "Allow calls to rcu_exit_user_irq from nesting irqs." ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index af0dc3472a4b..4138f59fa2f4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -425,6 +425,27 @@ void rcu_user_enter(void) } +/** + * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace + * after the current irq returns. + * + * This is similar to rcu_user_enter() but in the context of a non-nesting + * irq. After this call, RCU enters into idle mode when the interrupt + * returns. + */ +void rcu_user_enter_after_irq(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + /* Ensure this irq is interrupting a non-idle RCU state. */ + WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); + rdtp->dynticks_nesting = 1; + local_irq_restore(flags); +} + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -549,6 +570,28 @@ void rcu_user_exit(void) rcu_eqs_exit(1); } +/** + * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace + * idle mode after the current non-nesting irq returns. + * + * This is similar to rcu_user_exit() but in the context of an irq. + * This is called when the irq has interrupted a userspace RCU idle mode + * context. When the current non-nesting interrupt returns after this call, + * the CPU won't restore the RCU idle mode. + */ +void rcu_user_exit_after_irq(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + /* Ensure we are interrupting an RCU idle mode. */ + WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); + rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; + local_irq_restore(flags); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * -- cgit v1.2.2 From 9a0c6fef423528ba5b62aa31b29aabf689eb8f70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jun 2012 12:33:51 -0700 Subject: rcu: Make RCU_FAST_NO_HZ handle adaptive ticks The current implementation of RCU_FAST_NO_HZ tries reasonably hard to rid the current CPU of RCU callbacks. This is appropriate when the CPU is entering idle, where it doesn't have much useful to do anyway, but is most definitely not what you want when transitioning to user-mode execution. This commit therefore detects the adaptive-tick case, and refrains from burning CPU time getting rid of RCU callbacks in that case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9c71c1b18e03..f92115488187 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1757,6 +1757,26 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; + /* Adaptive-tick mode, where usermode execution is idle to RCU. */ + if (!is_idle_task(current)) { + rdtp->dyntick_holdoff = jiffies - 1; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("User dyntick with callbacks"); + rdtp->idle_gp_timer_expires = + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); + } else if (rcu_cpu_has_callbacks(cpu)) { + rdtp->idle_gp_timer_expires = + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); + trace_rcu_prep_idle("User dyntick with lazy callbacks"); + } else { + return; + } + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + return; + } + /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle -- cgit v1.2.2 From 2b1d5024e17be459aa6385763ca3faa8f01c52d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:30 +0200 Subject: rcu: Settle config for userspace extended quiescent state Create a new config option under the RCU menu that put CPUs under RCU extended quiescent state (as in dynticks idle mode) when they run in userspace. This require some contribution from architectures to hook into kernel and userspace boundaries. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4138f59fa2f4..79fa2db1595b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -400,6 +400,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); +#ifdef CONFIG_RCU_USER_QS /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -424,7 +425,6 @@ void rcu_user_enter(void) rcu_eqs_enter(1); } - /** * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace * after the current irq returns. @@ -445,6 +445,7 @@ void rcu_user_enter_after_irq(void) rdtp->dynticks_nesting = 1; local_irq_restore(flags); } +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -548,6 +549,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); +#ifdef CONFIG_RCU_USER_QS /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -591,6 +593,7 @@ void rcu_user_exit_after_irq(void) rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; local_irq_restore(flags); } +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle -- cgit v1.2.2 From c5d900bf676b1e2a61c44483932c8088651bbb4e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:31 +0200 Subject: rcu: Allow rcu_user_enter()/exit() to nest Allow calls to rcu_user_enter() even if we are already in userspace (as seen by RCU) and allow calls to rcu_user_exit() even if we are already in the kernel. This makes the APIs more flexible to be called from architectures. Exception entries for example won't need to know if they come from userspace before calling rcu_user_exit(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 41 +++++++++++++++++++++++++++++++++-------- kernel/rcutree.h | 3 +++ 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 79fa2db1595b..d62c04482228 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -366,11 +366,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, */ static void rcu_eqs_enter(bool user) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); @@ -379,7 +377,6 @@ static void rcu_eqs_enter(bool user) else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_eqs_enter_common(rdtp, oldval, user); - local_irq_restore(flags); } /** @@ -396,7 +393,11 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { + unsigned long flags; + + local_irq_save(flags); rcu_eqs_enter(0); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -411,6 +412,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { + unsigned long flags; + struct rcu_dynticks *rdtp; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -422,7 +426,15 @@ void rcu_user_enter(void) if (in_interrupt()) return; - rcu_eqs_enter(1); + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (!rdtp->in_user) { + rdtp->in_user = true; + rcu_eqs_enter(1); + } + local_irq_restore(flags); } /** @@ -516,11 +528,9 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, */ static void rcu_eqs_exit(bool user) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); @@ -529,7 +539,6 @@ static void rcu_eqs_exit(bool user) else rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(rdtp, oldval, user); - local_irq_restore(flags); } /** @@ -545,7 +554,11 @@ static void rcu_eqs_exit(bool user) */ void rcu_idle_exit(void) { + unsigned long flags; + + local_irq_save(flags); rcu_eqs_exit(0); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -558,6 +571,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_user_exit(void) { + unsigned long flags; + struct rcu_dynticks *rdtp; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -569,7 +585,13 @@ void rcu_user_exit(void) if (in_interrupt()) return; - rcu_eqs_exit(1); + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (rdtp->in_user) { + rdtp->in_user = false; + rcu_eqs_exit(1); + } + local_irq_restore(flags); } /** @@ -2586,6 +2608,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); +#ifdef CONFIG_RCU_USER_QS + WARN_ON_ONCE(rdp->dynticks->in_user); +#endif rdp->cpu = cpu; rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7576fd4d8ce6..10cc2f9f8433 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,6 +102,9 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ +#ifdef CONFIG_RCU_USER_QS + bool in_user; /* Is the CPU in userland from RCU POV? */ +#endif }; /* RCU's kthread states for tracing. */ -- cgit v1.2.2 From 1e1a689f10a27a4fe1ab9b4c6db04fa7232746a5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:32 +0200 Subject: rcu: Ignore userspace extended quiescent state by default By default we don't want to enter into RCU extended quiescent state while in userspace because doing this produces some overhead (eg: use of syscall slowpath). Set it off by default and ready to run when some feature like adaptive tickless need it. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 ++++- kernel/rcutree.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d62c04482228..6b82a9565149 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -206,6 +206,9 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_RCU_USER_QS + .ignore_user_qs = true, +#endif }; static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -430,7 +433,7 @@ void rcu_user_enter(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->in_user) { + if (!rdtp->ignore_user_qs && !rdtp->in_user) { rdtp->in_user = true; rcu_eqs_enter(1); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 10cc2f9f8433..5faf05d68326 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -103,6 +103,7 @@ struct rcu_dynticks { int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ #ifdef CONFIG_RCU_USER_QS + bool ignore_user_qs; /* Treat userspace as extended QS or not */ bool in_user; /* Is the CPU in userland from RCU POV? */ #endif }; -- cgit v1.2.2 From 04e7e951532b390b16feb070be9972b8fad2fc57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Jul 2012 15:06:40 -0700 Subject: rcu: Switch task's syscall hooks on context switch Clear the syscalls hook of a task when it's scheduled out so that if the task migrates, it doesn't run the syscall slow path on a CPU that might not need it. Also set the syscalls hook on the next task if needed. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 15 +++++++++++++++ kernel/sched/core.c | 1 + 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b82a9565149..d2e74c8d4b0e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -717,6 +717,21 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_RCU_USER_QS +void rcu_user_hooks_switch(struct task_struct *prev, + struct task_struct *next) +{ + struct rcu_dynticks *rdtp; + + /* Interrupts are disabled in context switch */ + rdtp = &__get_cpu_var(rcu_dynticks); + if (!rdtp->ignore_user_qs) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} +#endif /* #ifdef CONFIG_RCU_USER_QS */ + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a48cdbc8631..ea2213b07d9d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch(prev, next); switch_to(prev, next, prev); barrier(); -- cgit v1.2.2 From 90a340ed53f0f3bcc3fdf1b2cff56c0e4e911d01 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:36 +0200 Subject: rcu: Exit RCU extended QS on kernel preemption after irq/exception When an exception or an irq exits, and we are going to resume into interrupted kernel code, the low level architecture code calls preempt_schedule_irq() if there is a need to reschedule. If the interrupt/exception occured between a call to rcu_user_enter() (from syscall exit, exception exit, do_notify_resume exit, ...) and a real resume to userspace (iret,...), preempt_schedule_irq() can be called whereas RCU thinks we are in userspace. But preempt_schedule_irq() is going to run kernel code and may be some RCU read side critical section. We must exit the userspace extended quiescent state before we call it. To solve this, just call rcu_user_exit() in the beginning of preempt_schedule_irq(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea2213b07d9d..4adcd237c545 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3570,6 +3570,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); + rcu_user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); -- cgit v1.2.2 From 20ab65e33f469c35f3dabde3445b668aa9c943ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:37 +0200 Subject: rcu: Exit RCU extended QS on user preemption When exceptions or irq are about to resume userspace, if the task needs to be rescheduled, the arch low level code calls schedule() directly. If we call it, it is because we have the TIF_RESCHED flag: - It can be set after random local calls to set_need_resched() (RCU, drm, ...) - A wake up happened and the CPU needs preemption. This can happen in several ways: * Remotely: the remote waking CPU has set TIF_RESCHED and send the wakee an IPI to schedule the new task. * Remotely enqueued: the remote waking CPU sends an IPI to the target and the wake up is made by the target. * Locally: waking CPU == wakee CPU and the wakeup is done locally. set_need_resched() is called without IPI. In the case of local and remotely enqueued wake ups, the tick can be restarted when we enqueue the new task and RCU can exit the extended quiescent state at the same time. Then by the time we reach irq exit path and we call schedule, we are not in RCU user mode. But if we call schedule() only because something called set_need_resched(), RCU may still be in user mode when we reach schedule. Also if a wake up is done remotely, the CPU might see the TIF_RESCHED flag and call schedule while the IPI has not yet happen to restart the tick and exit RCU user mode. We need to manually protect against these corner cases. Create a new API schedule_user() that calls schedule() inside rcu_user_exit()-rcu_user_enter() in order to protect it. Archs will need to rely on it now to implement user preemption safely. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/sched/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4adcd237c545..3c4dec0594d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3469,6 +3469,21 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_RCU_USER_QS +asmlinkage void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + */ + rcu_user_exit(); + schedule(); + rcu_user_enter(); +} +#endif + /** * schedule_preempt_disabled - called with preemption disabled * -- cgit v1.2.2 From 1fd2b4425a5702c112b441e20b250ac8833a9608 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:40 +0200 Subject: rcu: Userspace RCU extended QS selftest Provide a config option that enables the userspace RCU extended quiescent state on every CPUs by default. This is for testing purpose. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2e74c8d4b0e..812d04b6b395 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -206,7 +206,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_RCU_USER_QS +#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) .ignore_user_qs = true, #endif }; -- cgit v1.2.2 From cb349ca95407cbc11424d5e9fc7c8e700709041b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Sep 2012 17:35:31 -0700 Subject: rcu: Apply micro-optimization and int/bool fixes to RCU's idle handling Checking "user" before "is_idle_task()" allows better optimizations in cases where inlining is possible. Also, "bool" should be passed "true" or "false" rather than "1" or "0". This commit therefore makes these changes, as noted in Josh's review. Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 812d04b6b395..4fb2376ddf06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -335,7 +335,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current) && !user) { + if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); @@ -399,7 +399,7 @@ void rcu_idle_enter(void) unsigned long flags; local_irq_save(flags); - rcu_eqs_enter(0); + rcu_eqs_enter(false); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -435,7 +435,7 @@ void rcu_user_enter(void) rdtp = &__get_cpu_var(rcu_dynticks); if (!rdtp->ignore_user_qs && !rdtp->in_user) { rdtp->in_user = true; - rcu_eqs_enter(1); + rcu_eqs_enter(true); } local_irq_restore(flags); } @@ -492,7 +492,7 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); else - rcu_eqs_enter_common(rdtp, oldval, 1); + rcu_eqs_enter_common(rdtp, oldval, true); local_irq_restore(flags); } @@ -513,7 +513,7 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current) && !user) { + if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", @@ -560,7 +560,7 @@ void rcu_idle_exit(void) unsigned long flags; local_irq_save(flags); - rcu_eqs_exit(0); + rcu_eqs_exit(false); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -592,7 +592,7 @@ void rcu_user_exit(void) rdtp = &__get_cpu_var(rcu_dynticks); if (rdtp->in_user) { rdtp->in_user = false; - rcu_eqs_exit(1); + rcu_eqs_exit(true); } local_irq_restore(flags); } @@ -653,7 +653,7 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); else - rcu_eqs_exit_common(rdtp, oldval, 1); + rcu_eqs_exit_common(rdtp, oldval, true); local_irq_restore(flags); } -- cgit v1.2.2 From 5a14fead07bcf4e0acc877a8d9e1d1f40a441153 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 24 Sep 2012 14:27:50 -0700 Subject: kernel/debug: Mask KGDB NMI upon entry The new arch callback should manage NMIs that usually cause KGDB to enter. That is, not all NMIs should be enabled/disabled, but only those that issue kgdb_handle_exception(). We must mask it as serial-line interrupt can be used as an NMI, so if the original KGDB-entry cause was say a breakpoint, then every input to KDB console will cause KGDB to reenter, which we don't want. Signed-off-by: Anton Vorontsov Acked-by: Jason Wessel Signed-off-by: Greg Kroah-Hartman --- kernel/debug/debug_core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..17e073c309e6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; + int ret = 0; + + if (arch_kgdb_ops.enable_nmi) + arch_kgdb_ops.enable_nmi(0); ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; @@ -681,11 +685,15 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! */ + goto out; /* Ouch, double exception ! */ if (kgdb_info[ks->cpu].enter_kgdb != 0) - return 0; + goto out; - return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); + ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +out: + if (arch_kgdb_ops.enable_nmi) + arch_kgdb_ops.enable_nmi(1); + return ret; } int kgdb_nmicallback(int cpu, void *regs) -- cgit v1.2.2 From ad394f66fa57ae66014cb74f337e2820bac4c417 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 24 Sep 2012 14:27:51 -0700 Subject: kdb: Implement disable_nmi command This command disables NMI-entry. If NMI source has been previously shared with a serial console ("debug port"), this effectively releases the port from KDB exclusive use, and makes the console available for normal use. Of course, NMI can be reenabled, enable_nmi modparam is used for that: echo 1 > /sys/module/kdb/parameters/enable_nmi Signed-off-by: Anton Vorontsov Acked-by: Jason Wessel Signed-off-by: Greg Kroah-Hartman --- kernel/debug/kdb/kdb_main.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a9..1261dc7eaeb9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -2107,6 +2108,32 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ + +/* Make sure we balance enable/disable calls, must disable first. */ +static atomic_t kdb_nmi_disabled; + +static int kdb_disable_nmi(int argc, const char *argv[]) +{ + if (atomic_read(&kdb_nmi_disabled)) + return 0; + atomic_set(&kdb_nmi_disabled, 1); + arch_kgdb_ops.enable_nmi(0); + return 0; +} + +static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) +{ + if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) + return -EINVAL; + arch_kgdb_ops.enable_nmi(1); + return 0; +} + +static const struct kernel_param_ops kdb_param_ops_enable_nmi = { + .set = kdb_param_enable_nmi, +}; +module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); + /* * kdb_cpu - This function implements the 'cpu' command. * cpu [] @@ -2851,6 +2878,10 @@ static void __init kdb_inittab(void) kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", "Display syslog buffer", 0, KDB_REPEAT_NONE); #endif + if (arch_kgdb_ops.enable_nmi) { + kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); + } kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); kdb_register_repeat("kill", kdb_kill, "<-signal> ", -- cgit v1.2.2 From ab72a7028c0cc22731dc60beceb595b321d1cdb9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Aug 2012 09:40:46 -0400 Subject: events: don't use get_unused_fd_flags() when get_unused_fd() will do Signed-off-by: Al Viro --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7fee567153f0..2bd199bfaef8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6264,7 +6264,7 @@ SYSCALL_DEFINE5(perf_event_open, if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; - event_fd = get_unused_fd_flags(O_RDWR); + event_fd = get_unused_fd(); if (event_fd < 0) return event_fd; -- cgit v1.2.2 From 7cf4dc3c8dbfdfde163d4636f621cf99a1f63bfb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Aug 2012 19:56:12 -0400 Subject: move files_struct-related bits from kernel/exit.c to fs/file.c Signed-off-by: Al Viro --- kernel/exit.c | 93 ----------------------------------------------------------- 1 file changed, 93 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5bb..20dfc7617c2e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -466,99 +466,6 @@ void daemonize(const char *name, ...) EXPORT_SYMBOL(daemonize); -static void close_files(struct files_struct * files) -{ - int i, j; - struct fdtable *fdt; - - j = 0; - - /* - * It is safe to dereference the fd table without RCU or - * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. - */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); - for (;;) { - unsigned long set; - i = j * BITS_PER_LONG; - if (i >= fdt->max_fds) - break; - set = fdt->open_fds[j++]; - while (set) { - if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); - if (file) { - filp_close(file, files); - cond_resched(); - } - } - i++; - set >>= 1; - } - } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -void put_files_struct(struct files_struct *files) -{ - struct fdtable *fdt; - - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* - * Free the fd and fdset arrays if we expanded them. - * If the fdtable was embedded, pass files for freeing - * at the end of the RCU grace period. Otherwise, - * you can free files immediately. - */ - rcu_read_lock(); - fdt = files_fdtable(files); - if (fdt != &files->fdtab) - kmem_cache_free(files_cachep, files); - free_fdtable(fdt); - rcu_read_unlock(); - } -} - -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ - struct files_struct * files = tsk->files; - - if (files) { - task_lock(tsk); - tsk->files = NULL; - task_unlock(tsk); - put_files_struct(files); - } -} - #ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. -- cgit v1.2.2 From 864bdb3b6cbd9911222543fef1cfe36f88183f44 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 22 Aug 2012 18:42:10 -0400 Subject: new helper: daemonize_descriptors() descriptor-related parts of daemonize, done right. As the result we simplify the locking rules for ->files - we hold task_lock in *all* cases when we modify ->files. Signed-off-by: Al Viro --- kernel/exit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 20dfc7617c2e..095113321318 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -457,9 +457,7 @@ void daemonize(const char *name, ...) /* Become as one with the init task */ daemonize_fs_struct(); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); + daemonize_descriptors(); reparent_to_kthreadd(); } -- cgit v1.2.2 From e10ce27f0df9eda7b36eb16e553f07a9e05c6bba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 13:02:21 -0400 Subject: switch prctl_set_mm_exe_file() to fget_light() Signed-off-by: Al Viro --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23eca..0cb4283df884 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1790,9 +1790,9 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct file *exe_file; struct dentry *dentry; - int err; + int err, fput_needed; - exe_file = fget(fd); + exe_file = fget_light(fd, &fput_needed); if (!exe_file) return -EBADF; @@ -1839,12 +1839,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_unlock; err = 0; - set_mm_exe_file(mm, exe_file); + set_mm_exe_file(mm, exe_file); /* this grabs a reference to exe_file */ exit_unlock: up_write(&mm->mmap_sem); exit: - fput(exe_file); + fput_light(exe_file, fput_needed); return err; } -- cgit v1.2.2 From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Aug 2012 12:52:22 -0400 Subject: switch simple cases of fget_light to fdget Signed-off-by: Al Viro --- kernel/events/core.c | 70 ++++++++++++++++++++++------------------------------ kernel/sys.c | 16 ++++++------ kernel/taskstats.c | 11 ++++----- 3 files changed, 43 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2bd199bfaef8..bd9c5bca42ae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -467,14 +467,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct file *file; - int ret = 0, fput_needed; + struct fd f = fdget(fd); + int ret = 0; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; - css = cgroup_css_from_dir(file, perf_subsys_id); + css = cgroup_css_from_dir(f.file, perf_subsys_id); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -500,7 +499,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - fput_light(file, fput_needed); + fdput(f); return ret; } @@ -3233,21 +3232,18 @@ unlock: static const struct file_operations perf_fops; -static struct file *perf_fget_light(int fd, int *fput_needed) +static inline int perf_fget_light(int fd, struct fd *p) { - struct file *file; - - file = fget_light(fd, fput_needed); - if (!file) - return ERR_PTR(-EBADF); + struct fd f = fdget(fd); + if (!f.file) + return -EBADF; - if (file->f_op != &perf_fops) { - fput_light(file, *fput_needed); - *fput_needed = 0; - return ERR_PTR(-EBADF); + if (f.file->f_op != &perf_fops) { + fdput(f); + return -EBADF; } - - return file; + *p = f; + return 0; } static int perf_event_set_output(struct perf_event *event, @@ -3279,22 +3275,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { - struct file *output_file = NULL; - struct perf_event *output_event = NULL; - int fput_needed = 0; int ret; - if (arg != -1) { - output_file = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_file)) - return PTR_ERR(output_file); - output_event = output_file->private_data; + struct perf_event *output_event; + struct fd output; + ret = perf_fget_light(arg, &output); + if (ret) + return ret; + output_event = output.file->private_data; + ret = perf_event_set_output(event, output_event); + fdput(output); + } else { + ret = perf_event_set_output(event, NULL); } - - ret = perf_event_set_output(event, output_event); - if (output_event) - fput_light(output_file, fput_needed); - return ret; } @@ -6229,12 +6222,11 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct file *group_file = NULL; + struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; - int fput_needed = 0; int err; /* for future expandability... */ @@ -6269,12 +6261,10 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_file = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_file)) { - err = PTR_ERR(group_file); + err = perf_fget_light(group_fd, &group); + if (err) goto err_fd; - } - group_leader = group_file->private_data; + group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6450,7 +6440,7 @@ SYSCALL_DEFINE5(perf_event_open, * of the group leader will find the pointer to itself in * perf_group_detach(). */ - fput_light(group_file, fput_needed); + fdput(group); fd_install(event_fd, event_file); return event_fd; @@ -6464,7 +6454,7 @@ err_task: if (task) put_task_struct(task); err_group_fd: - fput_light(group_file, fput_needed); + fdput(group); err_fd: put_unused_fd(event_fd); return err; diff --git a/kernel/sys.c b/kernel/sys.c index 0cb4283df884..f9492284e5d2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,15 +1788,15 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct file *exe_file; + struct fd exe; struct dentry *dentry; - int err, fput_needed; + int err; - exe_file = fget_light(fd, &fput_needed); - if (!exe_file) + exe = fdget(fd); + if (!exe.file) return -EBADF; - dentry = exe_file->f_path.dentry; + dentry = exe.file->f_path.dentry; /* * Because the original mm->exe_file points to executable file, make @@ -1805,7 +1805,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EACCES; if (!S_ISREG(dentry->d_inode->i_mode) || - exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; err = inode_permission(dentry->d_inode, MAY_EXEC); @@ -1839,12 +1839,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_unlock; err = 0; - set_mm_exe_file(mm, exe_file); /* this grabs a reference to exe_file */ + set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ exit_unlock: up_write(&mm->mmap_sem); exit: - fput_light(exe_file, fput_needed); + fdput(exe); return err; } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..5116b7e5962e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -415,16 +415,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) struct nlattr *na; size_t size; u32 fd; - struct file *file; - int fput_needed; + struct fd f; na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return 0; size = nla_total_size(sizeof(struct cgroupstats)); @@ -444,7 +443,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, file->f_dentry); + rc = cgroupstats_build(stats, f.file->f_dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; @@ -453,7 +452,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = send_reply(rep_skb, info); err: - fput_light(file, fput_needed); + fdput(f); return rc; } -- cgit v1.2.2 From c99af3752bb52ba3aece5315279a57a477edfaf1 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 22 Jun 2012 13:49:31 -0400 Subject: module: taint kernel when lve module is loaded Cloudlinux have a product called lve that includes a kernel module. This was previously GPLed but is now under a proprietary license, but the module continues to declare MODULE_LICENSE("GPL") and makes use of some EXPORT_SYMBOL_GPL symbols. Forcibly taint it in order to avoid this. Signed-off-by: Matthew Garrett Cc: Alex Lyashkov Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/module.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4edbd9c11aca..9ad9ee9406d6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2730,6 +2730,10 @@ static int check_module_license_and_versions(struct module *mod) if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + /* lve claims to be GPL but upstream won't provide source */ + if (strcmp(mod->name, "lve") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) -- cgit v1.2.2 From 786d35d45cc40b2a51a18f73e14e135d47fdced7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 28 Sep 2012 14:31:03 +0930 Subject: Make most arch asm/module.h files use asm-generic/module.h Use the mapping of Elf_[SPE]hdr, Elf_Addr, Elf_Sym, Elf_Dyn, Elf_Rel/Rela, ELF_R_TYPE() and ELF_R_SYM() to either the 32-bit version or the 64-bit version into asm-generic/module.h for all arches bar MIPS. Also, use the generic definition mod_arch_specific where possible. To this end, I've defined three new config bools: (*) HAVE_MOD_ARCH_SPECIFIC Arches define this if they don't want to use the empty generic mod_arch_specific struct. (*) MODULES_USE_ELF_RELA Arches define this if their modules can contain RELA records. This causes the Elf_Rela mapping to be emitted and allows apply_relocate_add() to be defined by the arch rather than have the core emit an error message. (*) MODULES_USE_ELF_REL Arches define this if their modules can contain REL records. This causes the Elf_Rel mapping to be emitted and allows apply_relocate() to be defined by the arch rather than have the core emit an error message. Note that it is possible to allow both REL and RELA records: m68k and mips are two arches that do this. With this, some arch asm/module.h files can be deleted entirely and replaced with a generic-y marker in the arch Kbuild file. Additionally, I have removed the bits from m32r and score that handle the unsupported type of relocation record as that's now handled centrally. Signed-off-by: David Howells Acked-by: Sam Ravnborg Signed-off-by: Rusty Russell --- kernel/module.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9ad9ee9406d6..7f2ee45f362c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1949,26 +1949,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) return ret; } -int __weak apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: REL relocation unsupported\n", me->name); - return -ENOEXEC; -} - -int __weak apply_relocate_add(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: RELA relocation unsupported\n", me->name); - return -ENOEXEC; -} - static int apply_relocations(struct module *mod, const struct load_info *info) { unsigned int i; -- cgit v1.2.2 From 6f13909f4fe9652f189b462c6c98767309000321 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Sep 2012 14:31:03 +0930 Subject: module: fix symbol waiting when module fails before init We use resolve_symbol_wait(), which blocks if the module containing the symbol is still loading. However: 1) The module_wq we use is only woken after calling the modules' init function, but there are other failure paths after the module is placed in the linked list where we need to do the same thing. 2) wake_up() only wakes one waiter, and our waitqueue is shared by all modules, so we need to wake them all. 3) wake_up_all() doesn't imply a memory barrier: I feel happier calling it after we've grabbed and dropped the module_mutex, not just after the state assignment. Signed-off-by: Rusty Russell --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7f2ee45f362c..63cf6e7f1394 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2959,7 +2959,7 @@ static struct module *load_module(void __user *umod, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); module_bug_cleanup(mod); - + wake_up_all(&module_wq); ddebug: dynamic_debug_remove(info.debug); unlock: @@ -3034,7 +3034,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); free_module(mod); - wake_up(&module_wq); + wake_up_all(&module_wq); return ret; } if (ret > 0) { @@ -3046,9 +3046,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, dump_stack(); } - /* Now it's a first class citizen! Wake up anyone waiting for it. */ + /* Now it's a first class citizen! */ mod->state = MODULE_STATE_LIVE; - wake_up(&module_wq); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); @@ -3071,6 +3070,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->init_ro_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); + wake_up_all(&module_wq); return 0; } -- cgit v1.2.2 From 9bb9c3be56834653878f766f471fa1c20e562f4c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Sep 2012 14:31:03 +0930 Subject: module: wait when loading a module which is currently initializing. The original module-init-tools module loader used a fnctl lock on the .ko file to avoid attempts to simultaneously load a module. Unfortunately, you can't get an exclusive fcntl lock on a read-only fd, making this not work for read-only mounted filesystems. module-init-tools has a hacky sleep-and-loop for this now. It's not that hard to wait in the kernel, and only return -EEXIST once the first module has finished loading (or continue loading the module if the first one failed to initialize for some reason). It's also consistent with what we do for dependent modules which are still loading. Suggested-by: Lucas De Marchi Signed-off-by: Rusty Russell --- kernel/module.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 63cf6e7f1394..74bc19562ca3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2845,6 +2845,20 @@ static int post_relocation(struct module *mod, const struct load_info *info) return module_finalize(info->hdr, info->sechdrs, mod); } +/* Is this module of this name done loading? No locks held. */ +static bool finished_loading(const char *name) +{ + struct module *mod; + bool ret; + + mutex_lock(&module_mutex); + mod = find_module(name); + ret = !mod || mod->state != MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + return ret; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static struct module *load_module(void __user *umod, @@ -2852,7 +2866,7 @@ static struct module *load_module(void __user *umod, const char __user *uargs) { struct load_info info = { NULL, }; - struct module *mod; + struct module *mod, *old; long err; pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2918,8 +2932,18 @@ static struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ +again: mutex_lock(&module_mutex); - if (find_module(mod->name)) { + if ((old = find_module(mod->name)) != NULL) { + if (old->state == MODULE_STATE_COMING) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto free_arch_cleanup; + goto again; + } err = -EEXIST; goto unlock; } -- cgit v1.2.2 From d55cb6cf143ae16eaa415baab520b8eaf4a1012f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Aug 2012 21:31:10 +0900 Subject: ftrace: Allow stealing pages from pipe buffer Use generic steal operation on pipe buffer to allow stealing ring buffer's read page from pipe buffer. Note that this could reduce the performance of splice on the splice_write side operation without affinity setting. Since the ring buffer's read pages are allocated on the tracing-node, but the splice user does not always execute splice write side operation on the same node. In this case, the page will be accessed from the another node. Thus, it is strongly recommended to assign the splicing thread to corresponding node. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/trace/trace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496ce..adde0994911e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4195,12 +4195,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -4216,7 +4210,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; -- cgit v1.2.2 From 79d54b249c176ba4abb9a580951400246dd974b1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 18:03:59 +0200 Subject: uprobes: Do not leak UTASK_BP_HIT if find_active_uprobe() fails If handle_swbp()->find_active_uprobe() fails we return with utask->state = UTASK_BP_HIT. Change handle_swbp() to reset utask->state at the start. Note that we do this unconditionally, see the next patch(es). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..2c1ff05af6f5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1468,6 +1468,10 @@ static void handle_swbp(struct pt_regs *regs) bp_vaddr = uprobe_get_swbp_addr(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + utask = current->utask; + if (utask) + utask->state = UTASK_RUNNING; + if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -1486,7 +1490,6 @@ static void handle_swbp(struct pt_regs *regs) return; } - utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ -- cgit v1.2.2 From 746a9e6ba24af2ccf03279c99d435a1b88ca5d17 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 18:23:51 +0200 Subject: uprobes: Do not setup ->active_uprobe/state prematurely handle_swbp() sets utask->active_uprobe before handler_chain(), and UTASK_SSTEP before pre_ssout(). This complicates the code for no reason, arch_ hooks or consumer->handler() should not (and can't) use this info. Change handle_swbp() to initialize them after pre_ssout(), and remove the no longer needed cleanup-utask code. Signed-off-by: Oleg Nesterov cked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2c1ff05af6f5..41f048c91425 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1496,22 +1496,19 @@ static void handle_swbp(struct pt_regs *regs) if (!utask) goto cleanup_ret; } - utask->active_uprobe = uprobe; + handler_chain(uprobe, regs); if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) goto cleanup_ret; - utask->state = UTASK_SSTEP; if (!pre_ssout(uprobe, regs, bp_vaddr)) { arch_uprobe_enable_step(&uprobe->arch); + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; return; } cleanup_ret: - if (utask) { - utask->active_uprobe = NULL; - utask->state = UTASK_RUNNING; - } if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) /* -- cgit v1.2.2 From 0578a97098dab967ece4b025fe5eb4984c4c86c0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 18:31:23 +0200 Subject: uprobes: Fix UPROBE_SKIP_SSTEP checks in handle_swbp() If handle_swbp()->add_utask() fails but UPROBE_SKIP_SSTEP is set, cleanup_ret: path do not restart the insn, this is wrong. Remove this check and add the additional label for can_skip_sstep() = T case. Note also that UPROBE_SKIP_SSTEP can be false positive, we simply can not trust it unless arch_uprobe_skip_sstep() was already called. Also, move another UPROBE_SKIP_SSTEP check before can_skip_sstep() into this helper, this looks more clean and understandable. Note: probably we should rename "skip" to "emulate" and I think that "clear UPROBE_SKIP_SSTEP" should be moved to arch_can_skip. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 41f048c91425..d2392968d4e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1389,10 +1389,11 @@ bool uprobe_deny_signal(void) */ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - - uprobe->flags &= ~UPROBE_SKIP_SSTEP; + if (uprobe->flags & UPROBE_SKIP_SSTEP) { + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) + return true; + uprobe->flags &= ~UPROBE_SKIP_SSTEP; + } return false; } @@ -1494,12 +1495,12 @@ static void handle_swbp(struct pt_regs *regs) utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto cleanup_ret; + goto restart; } handler_chain(uprobe, regs); - if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) - goto cleanup_ret; + if (can_skip_sstep(uprobe, regs)) + goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) { arch_uprobe_enable_step(&uprobe->arch); @@ -1508,15 +1509,13 @@ static void handle_swbp(struct pt_regs *regs) return; } -cleanup_ret: - if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); - +restart: + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); +out: put_uprobe(uprobe); } -- cgit v1.2.2 From 1b08e907211cdc744f54871736005d9f3e7f182c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 18:52:10 +0200 Subject: uprobes: Kill UTASK_BP_HIT state Kill UTASK_BP_HIT state, it buys nothing but complicates the code. It is only used in uprobe_notify_resume() to decide who should be called, we can check utask->active_uprobe != NULL instead. And this allows us to simplify handle_swbp(), no need to clear utask->state. Likewise we could kill UTASK_SSTEP, but UTASK_BP_HIT is worse and imho should die. The problem is, it creates the special case when task->utask is NULL, we can't distinguish RUNNING and BP_HIT. With this patch utask == NULL always means RUNNING. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d2392968d4e6..d3f5381e7482 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1469,10 +1469,6 @@ static void handle_swbp(struct pt_regs *regs) bp_vaddr = uprobe_get_swbp_addr(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); - utask = current->utask; - if (utask) - utask->state = UTASK_RUNNING; - if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -1491,6 +1487,7 @@ static void handle_swbp(struct pt_regs *regs) return; } + utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ @@ -1547,13 +1544,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) } /* - * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on - * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and - * allows the thread to return from interrupt. + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and + * allows the thread to return from interrupt. After that handle_swbp() + * sets utask->active_uprobe. * - * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and - * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from - * interrupt. + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag + * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). @@ -1563,10 +1559,10 @@ void uprobe_notify_resume(struct pt_regs *regs) struct uprobe_task *utask; utask = current->utask; - if (!utask || utask->state == UTASK_BP_HIT) - handle_swbp(regs); - else + if (utask && utask->active_uprobe) handle_singlestep(utask, regs); + else + handle_swbp(regs); } /* @@ -1575,17 +1571,10 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - struct uprobe_task *utask; - if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) return 0; - utask = current->utask; - if (utask) - utask->state = UTASK_BP_HIT; - set_thread_flag(TIF_UPROBE); - return 1; } -- cgit v1.2.2 From db023ea595015058270be6a62fe60a7b6b5c50d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 19:05:46 +0200 Subject: uprobes: Move clear_thread_flag(TIF_UPROBE) to uprobe_notify_resume() Move clear_thread_flag(TIF_UPROBE) from do_notify_resume() to uprobe_notify_resume() for !CONFIG_UPROBES case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d3f5381e7482..198d732ab901 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1558,6 +1558,8 @@ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; + clear_thread_flag(TIF_UPROBE); + utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); -- cgit v1.2.2 From 75ed82ea53bd0d2d8083261123576250f7ba851e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 16 Sep 2012 17:20:06 +0200 Subject: uprobes: Change write_opcode() to use FOLL_FORCE write_opcode()->get_user_pages() needs FOLL_FORCE to ensure we can read the page even if the probed task did mprotect(PROT_NONE) after uprobe_register(). Without FOLL_WRITE, FOLL_FORCE doesn't have any side effect but allows to read the !VM_READ memory. Otherwiese the subsequent uprobe_unregister()->set_orig_insn() fails and we leak "int3". If that task does mprotect(PROT_READ | EXEC) and execute the probed insn later it will be killed. Note: in fact this is also needed for _register, see the next patch. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 198d732ab901..80e8c7b697b9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -221,7 +221,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; -- cgit v1.2.2 From 78a320542e6cdb2800cd736b2d136e4261d34f43 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 16 Sep 2012 19:07:41 +0200 Subject: uprobes: Change valid_vma() to demand VM_MAYEXEC rather than VM_EXEC uprobe_register() or uprobe_mmap() requires VM_READ | VM_EXEC, this is not right. An apllication can do mprotect(PROT_EXEC) later and execute this code. Change valid_vma(is_register => true) to check VM_MAYEXEC instead. No need to check VM_MAYREAD, it is always set. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 80e8c7b697b9..a9de40815391 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -106,8 +106,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) - == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB | VM_WRITE | VM_MAYEXEC | VM_SHARED)) + == VM_MAYEXEC) return true; return false; -- cgit v1.2.2 From e40cfce626a5537994058ee9a940dcfdc0f68ef0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 16 Sep 2012 19:31:39 +0200 Subject: uprobes: Restrict valid_vma(false) to skip VM_SHARED vmas valid_vma(false) ignores ->vm_flags, this is not actually right. We should never try to write into MAP_SHARED mapping, this can confuse an apllication which actually writes to ->vm_file. With this patch valid_vma(false) ignores VM_WRITE only but checks other (immutable) bits checked by valid_vma(true). This can also speedup uprobe_munmap() and uprobe_unregister(). Note: even after this patch _unregister can confuse the probed application if it does mprotect(PROT_WRITE) after _register and installs "int3", but this is hardly possible to avoid and this doesn't differ from gdb case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a9de40815391..8d182bdecc2e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -100,17 +100,12 @@ struct uprobe { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - if (!vma->vm_file) - return false; - - if (!is_register) - return true; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; - if ((vma->vm_flags & (VM_HUGETLB | VM_WRITE | VM_MAYEXEC | VM_SHARED)) - == VM_MAYEXEC) - return true; + if (is_register) + flags |= VM_WRITE; - return false; + return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) -- cgit v1.2.2 From e97f65a17deafacc360a4cb75ae944897ecea6d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 19 Sep 2012 16:36:01 +0200 Subject: uprobes: Kill set_swbp()->is_swbp_at_addr() A separate patch for better documentation. set_swbp()->is_swbp_at_addr() is not needed for correctness, it is harmless to do the unnecessary __replace_page(old_page, new_page) when these 2 pages are identical. And it can not be counted as optimization. mmap/register races are very unlikely, while in the likely case is_swbp_at_addr() adds the extra get_user_pages() even if the caller is uprobe_mmap(current->mm) and returns false. Note also that the semantics/usage of is_swbp_at_addr() in uprobe.c is confusing. set_swbp() uses it to detect the case when this insn was already modified by uprobes, that is why it should always compare the opcode with UPROBE_SWBP_INSN even if the hardware (like powerpc) has other trap insns. It doesn't matter if this breakpoint was in fact installed by gdb or application itself, we are going to "steal" this breakpoint anyway and execute the original insn from vm_file even if it no longer matches the memory. OTOH, handle_swbp()->find_active_uprobe() uses is_swbp_at_addr() to figure out whether we need to send SIGTRAP or not if we can not find uprobe, so in this case it should return true for all trap variants, not only for UPROBE_SWBP_INSN. This patch removes set_swbp()->is_swbp_at_addr(), the next patches will remove it from set_orig_insn() which is similar to set_swbp() in this respect. So the only caller will be handle_swbp() and we can make its semantics clear. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d182bdecc2e..a4453d1c8199 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -321,17 +321,6 @@ out: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - int result; - /* - * See the comment near uprobes_hash(). - */ - result = is_swbp_at_addr(mm, vaddr); - if (result == 1) - return 0; - - if (result) - return result; - return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } -- cgit v1.2.2 From cceb55aab73d2aea8f4d6f7414d2e1b647a3dacb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 23 Sep 2012 21:10:18 +0200 Subject: uprobes: Introduce copy_opcode(), kill read_opcode() No functional changes, preparations. 1. Extract the kmap-and-memcpy code from read_opcode() into the new trivial helper, copy_opcode(). The next patch will add another user. 2. read_opcode() becomes really trivial, fold it into its single caller, is_swbp_at_addr(). 3. Remove "auprobe" argument from write_opcode(), it is not used since f403072c6. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 63 +++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a4453d1c8199..b6f0f716a884 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -183,19 +183,25 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } +static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +{ + void *kaddr = kmap_atomic(page); + memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + kunmap_atomic(kaddr); +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify read_opcode / + * supported by that architecture then we need to modify is_swbp_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ /* * write_opcode - write the opcode at a given virtual address. - * @auprobe: arch breakpointing information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -206,8 +212,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. */ -static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +static int write_opcode(struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t opcode) { struct page *old_page, *new_page; void *vaddr_old, *vaddr_new; @@ -253,40 +259,9 @@ put_old: return ret; } -/** - * read_opcode - read the opcode at a given virtual address. - * @mm: the probed process address space. - * @vaddr: the virtual address to read the opcode. - * @opcode: location to store the read opcode. - * - * Called with mm->mmap_sem held (for read and with a reference to - * mm. - * - * For mm @mm, read the opcode at @vaddr and store it in @opcode. - * Return 0 (success) or a negative errno. - */ -static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) -{ - struct page *page; - void *vaddr_new; - int ret; - - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); - if (ret <= 0) - return ret; - - vaddr_new = kmap_atomic(page); - vaddr &= ~PAGE_MASK; - memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); - kunmap_atomic(vaddr_new); - - put_page(page); - - return 0; -} - static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) { + struct page *page; uprobe_opcode_t opcode; int result; @@ -300,14 +275,14 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) goto out; } - result = read_opcode(mm, vaddr, &opcode); - if (result) + result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + if (result < 0) return result; -out: - if (is_swbp_insn(&opcode)) - return 1; - return 0; + copy_opcode(page, vaddr, &opcode); + put_page(page); + out: + return is_swbp_insn(&opcode); } /** @@ -321,7 +296,7 @@ out: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -345,7 +320,7 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v if (result != 1) return result; - return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) -- cgit v1.2.2 From ed6f6a50dc5f183c53e7b3b7fed4794bc50d9aa7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 23 Sep 2012 21:30:44 +0200 Subject: uprobes: Kill set_orig_insn()->is_swbp_at_addr() Unlike set_swbp(), set_orig_insn()->is_swbp_at_addr() makes sense, although it can't prevent all confusions. But the usage of is_swbp_at_addr() is equally confusing, and it adds the extra get_user_pages() we can avoid. This patch removes set_orig_insn()->is_swbp_at_addr() but changes write_opcode() to do the necessary checks before replace_page(). Perhaps it also makes sense to ensure PAGE_MAPPING_ANON in unregister case. find_active_uprobe() becomes the only user of is_swbp_at_addr(), we can change its semantics. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b6f0f716a884..9248ee76b4bb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -190,6 +190,25 @@ static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t kunmap_atomic(kaddr); } +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +{ + uprobe_opcode_t old_opcode; + bool is_swbp; + + copy_opcode(page, vaddr, &old_opcode); + is_swbp = is_swbp_insn(&old_opcode); + + if (is_swbp_insn(new_opcode)) { + if (is_swbp) /* register: already installed? */ + return 0; + } else { + if (!is_swbp) /* unregister: was it changed by us? */ + return -EINVAL; + } + + return 1; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -226,6 +245,10 @@ retry: if (ret <= 0) return ret; + ret = verify_opcode(old_page, vaddr, &opcode); + if (ret <= 0) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) @@ -311,15 +334,6 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - int result; - - result = is_swbp_at_addr(mm, vaddr); - if (!result) - return -EINVAL; - - if (result != 1) - return result; - return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } -- cgit v1.2.2 From ec75fba93ef0c00c91545b5e53841a80cffad0c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 23 Sep 2012 21:55:19 +0200 Subject: uprobes: Simplify is_swbp_at_addr(), remove stale comments After the previous change is_swbp_at_addr() is always called with current->mm. Remove this check and move it close to its single caller. Also, remove the obsolete comment about is_swbp_at_addr() and uprobe_state.count. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 73 ++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9248ee76b4bb..6136854da6c6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -282,32 +282,6 @@ put_old: return ret; } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) -{ - struct page *page; - uprobe_opcode_t opcode; - int result; - - if (current->mm == mm) { - pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); - pagefault_enable(); - - if (likely(result == 0)) - goto out; - } - - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); - if (result < 0) - return result; - - copy_opcode(page, vaddr, &opcode); - put_page(page); - out: - return is_swbp_insn(&opcode); -} - /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. @@ -589,29 +563,6 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } -/* - * How mm->uprobes_state.count gets updated - * uprobe_mmap() increments the count if - * - it successfully adds a breakpoint. - * - it cannot add a breakpoint, but sees that there is a underlying - * breakpoint (via a is_swbp_at_addr()). - * - * uprobe_munmap() decrements the count if - * - it sees a underlying breakpoint, (via is_swbp_at_addr) - * (Subsequent uprobe_unregister wouldnt find the breakpoint - * unless a uprobe_mmap kicks in, since the old vma would be - * dropped just after uprobe_munmap.) - * - * uprobe_register increments the count if: - * - it successfully adds a breakpoint. - * - * uprobe_unregister decrements the count if: - * - it sees a underlying breakpoint and removes successfully. - * (via is_swbp_at_addr) - * (Subsequent uprobe_munmap wouldnt find the breakpoint - * since there is no underlying breakpoint after the - * breakpoint removal.) - */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) @@ -1389,6 +1340,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + uprobe_opcode_t opcode; + int result; + + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + + result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + if (result < 0) + return result; + + copy_opcode(page, vaddr, &opcode); + put_page(page); + out: + return is_swbp_insn(&opcode); +} + static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; -- cgit v1.2.2 From 2aa3a7f8660355c3dddead17e224545c1a3d5a5f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Sep 2012 19:55:31 -0400 Subject: preparation for generic kernel_thread() Let architectures select GENERIC_KERNEL_THREAD and have their copy_thread() treat NULL regs as "it came from kernel_thread(), sp argument contains the function new thread will be calling and stack_size - the argument for that function". Switching the architectures begins shortly... Signed-off-by: Al Viro --- kernel/fork.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..a42c62a8eb24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1609,7 +1609,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ - if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1659,6 +1659,17 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifdef CONFIG_GENERIC_KERNEL_THREAD +/* + * Create a kernel thread. + */ +pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + (unsigned long)arg, NULL, NULL); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -- cgit v1.2.2 From 16a8016372c42c7628eb4a39d75386a461e8c5d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 1 Jun 2012 14:22:01 -0400 Subject: sanitize tsk_is_polling() Make default just return 0. The current default (checking TIF_POLLING_NRFLAG) is taken to architectures that need it; ones that don't do polling in their idle threads don't need to defined TIF_POLLING_NRFLAG at all. ia64 defined both TS_POLLING (used by its tsk_is_polling()) and TIF_POLLING_NRFLAG (not used at all). Killed the latter... Signed-off-by: Al Viro --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 649c9f876cb1..9dcf93c24d10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,7 +505,7 @@ static inline void init_hrtick(void) #ifdef CONFIG_SMP #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0 #endif void resched_task(struct task_struct *p) -- cgit v1.2.2 From 2b17c545a4cdbbbadcd7f1e9684c2d7db8f085a6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Oct 2012 01:46:44 +0200 Subject: nohz: Fix one jiffy count too far in idle cputime When we stop the tick in idle, we save the current jiffies value in ts->idle_jiffies. This snapshot is substracted from the later value of jiffies when the tick is restarted and the resulting delta is accounted as idle cputime. This is how we handle the idle cputime accounting without the tick. But sometimes we need to schedule the next tick to some time in the future instead of completely stopping it. In this case, a tick may happen before we restart the periodic behaviour and from that tick we account one jiffy to idle cputime as usual but we also increment the ts->idle_jiffies snapshot by one so that when we compute the delta to account, we substract the one jiffy we just accounted. To prepare for stopping the tick outside idle, we introduced a check that prevents from fixing up that ts->idle_jiffies if we are not running the idle task. But we use idle_cpu() for that and this is a problem if we run the tick while another CPU remotely enqueues a ttwu to our runqueue: CPU 0: CPU 1: tick_sched_timer() { ttwu_queue_remote() if (idle_cpu(CPU 0)) ts->idle_jiffies++; } Here, idle_cpu() notes that &rq->wake_list is not empty and hence won't consider the CPU as idle. As a result, ts->idle_jiffies won't be incremented. But this is wrong because we actually account the current jiffy to idle cputime. And that jiffy won't get substracted from the nohz time delta. So in the end, this jiffy is accounted twice. Fix this by changing idle_cpu(smp_processor_id()) with is_idle_task(current). This way the jiffy is substracted correctly even if a ttwu operation is enqueued on the CPU. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: # 3.5+ Link: http://lkml.kernel.org/r/1349308004-3482-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f423bdd035c2..a40260885265 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - if (idle_cpu(cpu)) + if (is_idle_task(current)) ts->idle_jiffies++; } update_process_times(user_mode(regs)); -- cgit v1.2.2 From 5f7865f3e44db4c73fdc454fb2af40806212a7ca Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 25 Sep 2012 21:12:30 +0800 Subject: sched: Ensure 'sched_domains_numa_levels' is safe to use in other functions We should temporarily reset 'sched_domains_numa_levels' to 0 after it is reset to 'level' in sched_init_numa(). If it fails to allocate memory for array sched_domains_numa_masks[][], the array will contain less then 'level' members. This could be dangerous when we use it to iterate array sched_domains_numa_masks[][] in other functions. This patch set sched_domains_numa_levels to 0 before initializing array sched_domains_numa_masks[][], and reset it to 'level' when sched_domains_numa_masks[][] is fully initialized. Signed-off-by: Tang Chen Signed-off-by: Wen Congyang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1348578751-16904-2-git-send-email-tangchen@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c17747236438..f895fdd32c5b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6122,6 +6122,17 @@ static void sched_init_numa(void) * numbers. */ + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -6176,6 +6187,8 @@ static void sched_init_numa(void) } sched_domain_topology = tl; + + sched_domains_numa_levels = level; } #else static inline void sched_init_numa(void) -- cgit v1.2.2 From 301a5cba2887d1f640e6d5184b05a6d7132017d5 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 25 Sep 2012 21:12:31 +0800 Subject: sched: Update sched_domains_numa_masks[][] when new cpus are onlined Once array sched_domains_numa_masks[] []is defined, it is never updated. When a new cpu on a new node is onlined, the coincident member in sched_domains_numa_masks[][] is not initialized, and all the masks are 0. As a result, the build_overlap_sched_groups() will initialize a NULL sched_group for the new cpu on the new node, which will lead to kernel panic: [ 3189.403280] Call Trace: [ 3189.403286] [] warn_slowpath_common+0x7f/0xc0 [ 3189.403289] [] warn_slowpath_null+0x1a/0x20 [ 3189.403292] [] build_sched_domains+0x467/0x470 [ 3189.403296] [] partition_sched_domains+0x307/0x510 [ 3189.403299] [] ? partition_sched_domains+0x142/0x510 [ 3189.403305] [] cpuset_update_active_cpus+0x83/0x90 [ 3189.403308] [] cpuset_cpu_active+0x38/0x70 [ 3189.403316] [] notifier_call_chain+0x67/0x150 [ 3189.403320] [] ? native_cpu_up+0x18a/0x1b5 [ 3189.403328] [] __raw_notifier_call_chain+0xe/0x10 [ 3189.403333] [] __cpu_notify+0x20/0x40 [ 3189.403337] [] _cpu_up+0xe9/0x131 [ 3189.403340] [] cpu_up+0xdb/0xee [ 3189.403348] [] store_online+0x9c/0xd0 [ 3189.403355] [] dev_attr_store+0x20/0x30 [ 3189.403361] [] sysfs_write_file+0xa3/0x100 [ 3189.403368] [] vfs_write+0xd0/0x1a0 [ 3189.403371] [] sys_write+0x54/0xa0 [ 3189.403375] [] system_call_fastpath+0x16/0x1b [ 3189.403377] ---[ end trace 1e6cf85d0859c941 ]--- [ 3189.403398] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 This patch registers a new notifier for cpu hotplug notify chain, and updates sched_domains_numa_masks every time a new cpu is onlined or offlined. Signed-off-by: Tang Chen Signed-off-by: Wen Congyang [ fixed compile warning ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1348578751-16904-3-git-send-email-tangchen@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f895fdd32c5b..8322d73b4392 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6190,10 +6190,65 @@ static void sched_init_numa(void) sched_domains_numa_levels = level; } + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} #else static inline void sched_init_numa(void) { } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -6642,6 +6697,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); -- cgit v1.2.2 From 3f1f33206c16c7b3839d71372bc2ac3f305aa802 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Oct 2012 15:38:52 +0200 Subject: perf: Clarify perf_cpu_context::active_pmu usage by renaming it to ::unique_pmu Stephane thought the perf_cpu_context::active_pmu name confusing and suggested using 'unique_pmu' instead. This pointer is a pointer to a 'random' pmu sharing the cpuctx instance, therefore limiting a for_each_pmu loop to those where cpuctx->unique_pmu matches the pmu we get a loop over unique cpuctx instances. Suggested-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kxyjqpfj2fn9gt7kwu5ag9ks@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b9df353ba1b..81939e8999a5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4419,7 +4419,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4565,7 +4565,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4761,7 +4761,7 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -5862,8 +5862,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; + if (cpuctx->unique_pmu == old_pmu) + cpuctx->unique_pmu = pmu; } } @@ -5998,7 +5998,7 @@ skip_type: cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; + cpuctx->unique_pmu = pmu; } got_cpu_context: -- cgit v1.2.2 From 95cf59ea72331d0093010543b8951bb43f262cac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Oct 2012 15:41:23 +0200 Subject: perf: Fix perf_cgroup_switch for sw-events Jiri reported that he could trigger the WARN_ON_ONCE() in perf_cgroup_switch() using sw-events. This is because sw-events share a cpuctx with multiple PMUs. Use the ->unique_pmu pointer to limit the pmu iteration to unique cpuctx instances. Reported-and-Tested-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-so7wi2zf3jjzrwcutm2mkz0j@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 81939e8999a5..fd15593c7f54 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -372,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + continue; /* ensure we process each cpuctx once */ /* * perf_cgroup_events says at least one @@ -395,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) if (mode & PERF_CGROUP_SWIN) { WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around */ cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); -- cgit v1.2.2 From f96972f2dc6365421cf2366ebd61ee4cf060c8d5 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Thu, 4 Oct 2012 17:12:23 -0700 Subject: kernel/sys.c: call disable_nonboot_cpus() in kernel_restart() As kernel_power_off() calls disable_nonboot_cpus(), we may also want to have kernel_restart() call disable_nonboot_cpus(). Doing so can help machines that require boot cpu be the last alive cpu during reboot to survive with kernel restart. This fixes one reboot issue seen on imx6q (Cortex-A9 Quad). The machine requires that the restart routine be run on the primary cpu rather than secondary ones. Otherwise, the secondary core running the restart routine will fail to come to online after reboot. Signed-off-by: Shawn Guo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f9492284e5d2..fdad206165d0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + disable_nonboot_cpus(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else -- cgit v1.2.2 From 6c0c0d4d1080840eabb3d055d2fd81911111c5fd Mon Sep 17 00:00:00 2001 From: hongfeng Date: Thu, 4 Oct 2012 17:12:25 -0700 Subject: poweroff: fix bug in orderly_poweroff() orderly_poweroff is trying to poweroff platform in two steps: step 1: Call user space application to poweroff step 2: If user space poweroff fail, then do a force power off if force param is set. The bug here is, step 1 is always successful with param UMH_NO_WAIT, which obey the design goal of orderly_poweroff. We have two choices here: UMH_WAIT_EXEC which means wait for the exec, but not the process; UMH_WAIT_PROC which means wait for the process to complete. we need to trade off the two choices: If using UMH_WAIT_EXEC, there is potential issue comments by Serge E. Hallyn: The exec will have started, but may for whatever (very unlikely) reason fail. If using UMH_WAIT_PROC, there is potential issue comments by Eric W. Biederman: If the caller is not running in a kernel thread then we can easily get into a case where the user space caller will block waiting for us when we are waiting for the user space caller. Thanks for their excellent ideas, based on the above discussion, we finally choose UMH_WAIT_EXEC, which is much more safe, if the user application really fails, we just complain the application itself, it seems a better choice here. Signed-off-by: Feng Hong Acked-by: Kees Cook Acked-by: Serge Hallyn Cc: "Eric W. Biederman" Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index fdad206165d0..c5cb5b99cb81 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2205,7 +2205,7 @@ static int __orderly_poweroff(void) return -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, NULL, argv_cleanup, NULL); if (ret == -ENOMEM) argv_free(argv); -- cgit v1.2.2 From 046d662f481830e652ac34cd112249adde16452a Mon Sep 17 00:00:00 2001 From: Alex Kelly Date: Thu, 4 Oct 2012 17:15:23 -0700 Subject: coredump: make core dump functionality optional Adds an expert Kconfig option, CONFIG_COREDUMP, which allows disabling of core dump. This saves approximately 2.6k in the compiled kernel, and complements CONFIG_ELF_CORE, which now depends on it. CONFIG_COREDUMP also disables coredump-related sysctls, except for suid_dumpable and related functions, which are necessary for ptrace. [akpm@linux-foundation.org: fix binfmt_aout.c build] Signed-off-by: Alex Kelly Reviewed-by: Josh Triplett Acked-by: Serge Hallyn Acked-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 84c76a34e41c..c2a2f8084bad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,10 +97,12 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int max_threads; -extern int core_uses_pid; extern int suid_dumpable; +#ifdef CONFIG_COREDUMP +extern int core_uses_pid; extern char core_pattern[]; extern unsigned int core_pipe_limit; +#endif extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_COREDUMP { .procname = "core_uses_pid", .data = &core_uses_pid, @@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -2036,12 +2042,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { +#ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMPABLE_SAFE && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ "core dump path required.\n"); } +#endif } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, @@ -2053,6 +2061,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, return error; } +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2061,6 +2070,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, validate_coredump_safety(); return error; } +#endif static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, -- cgit v1.2.2 From 179899fd5dc780fe3bcd44d0eb7823e3d855c855 Mon Sep 17 00:00:00 2001 From: Alex Kelly Date: Thu, 4 Oct 2012 17:15:24 -0700 Subject: coredump: update coredump-related headers Create a new header file, fs/coredump.h, which contains functions only used by the new coredump.c. It also moves do_coredump to the include/linux/coredump.h header file, for consistency. Signed-off-by: Alex Kelly Reviewed-by: Josh Triplett Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2c681f11b7d2..2ad3f5904bd7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From 5ab1c309b344880d81494e9eab7fb27682bc6d9d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 4 Oct 2012 17:15:29 -0700 Subject: coredump: pass siginfo_t* to do_coredump() and below, not merely signr This is a preparatory patch for the introduction of NT_SIGINFO elf note. With this patch we pass "siginfo_t *siginfo" instead of "int signr" to do_coredump() and put it into coredump_params. It will be used by the next patch. Most changes are simple s/signr/siginfo->si_signo/. Signed-off-by: Denys Vlasenko Reviewed-by: Oleg Nesterov Cc: Amerigo Wang Cc: "Jonathan M. Foote" Cc: Roland McGrath Cc: Pedro Alves Cc: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2ad3f5904bd7..0af8868525d6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2360,7 +2360,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info->si_signo, info->si_signo, regs); + do_coredump(info, regs); } /* -- cgit v1.2.2 From de4ec99c32cca755a11f91abb86ed94ce11f2e60 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 4 Oct 2012 17:15:47 -0700 Subject: kdump: remove unneeded include The inclusion of is unnecessary. Signed-off-by: Wei Yongjun Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d6413..5e4bd7864c5d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.2 From 0324b5a450f8a58304e93c5d886add24ca3527bc Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 4 Oct 2012 17:16:52 -0700 Subject: taskstats: cgroupstats_user_cmd() may leak on error If prepare_reply() succeeds we have allocated memory for 'rep_skb'. If nla_reserve() then subsequently fails and returns NULL we fail to release the memory we allocated, thus causing a leak. Signed-off-by: Jesper Juhl Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 610f0838d555..145bb4d3bd4d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -445,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { + nlmsg_free(rep_skb); rc = -EMSGSIZE; goto err; } -- cgit v1.2.2 From 4965f5667f36a95b41cda6638875bc992bd7d18b Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Thu, 4 Oct 2012 17:16:55 -0700 Subject: kernel/resource.c: fix stack overflow in __reserve_region_with_split() Using a recursive call add a non-conflicting region in __reserve_region_with_split() could result in a stack overflow in the case that the recursive calls are too deep. Convert the recursive calls to an iterative loop to avoid the problem. Tested on a machine containing 135 regions. The kernel no longer panicked with stack overflow. Also tested with code arbitrarily adding regions with no conflict, embedding two consecutive conflicts and embedding two non-consecutive conflicts. Signed-off-by: T Makphaibulchoke Reviewed-by: Ram Pai Cc: Paul Gortmaker Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 34d45886ee84..73f35d4b30b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *parent = root; struct resource *conflict; struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *next_res = NULL; if (!res) return; @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root, res->end = end; res->flags = IORESOURCE_BUSY; - conflict = __request_resource(parent, res); - if (!conflict) - return; + while (1) { - /* failed, split and try again */ - kfree(res); + conflict = __request_resource(parent, res); + if (!conflict) { + if (!next_res) + break; + res = next_res; + next_res = NULL; + continue; + } - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; + /* conflict covered whole area */ + if (conflict->start <= res->start && + conflict->end >= res->end) { + kfree(res); + WARN_ON(next_res); + break; + } + + /* failed, split and try again */ + if (conflict->start > res->start) { + end = res->end; + res->end = conflict->start - 1; + if (conflict->end < end) { + next_res = kzalloc(sizeof(*next_res), + GFP_ATOMIC); + if (!next_res) { + kfree(res); + break; + } + next_res->name = name; + next_res->start = conflict->end + 1; + next_res->end = end; + next_res->flags = IORESOURCE_BUSY; + } + } else { + res->start = conflict->end + 1; + } + } - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); } void __init reserve_region_with_split(struct resource *root, -- cgit v1.2.2 From a5f658b71bc622b731961ea3addcf146ed3c599f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Sep 2012 18:21:09 +0200 Subject: uprobes: Don't return success if alloc_uprobe() fails If alloc_uprobe() fails uprobe_register() should return ENOMEM, not 0. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6136854da6c6..588a5575d64c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -813,7 +813,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - if (uprobe && !consumer_add(uprobe, uc)) { + if (!uprobe) { + ret = -ENOMEM; + } else if (!consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { uprobe->consumers = NULL; -- cgit v1.2.2 From 076a365b3da99b68c5d58e394714d0611f1fa002 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Sep 2012 18:54:53 +0200 Subject: uprobes: Do not delete uprobe if uprobe_unregister() fails delete_uprobe() must not be called if register_for_each_vma(false) fails to remove all breakpoints, __uprobe_unregister() is correct. The problem is that register_for_each_vma(false) always returns 0 and thus this logic does not work. 1. Change verify_opcode() to return 0 rather than -EINVAL when unregister detects the !is_swbp insn, we can treat this case as success and currently unregister paths ignore the error code anyway. 2. Change remove_breakpoint() to propagate the error code from write_opcode(). 3. Change register_for_each_vma(is_register => false) to remove as much breakpoints as possible but return non-zero if remove_breakpoint() fails at least once. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 588a5575d64c..a1b466d17c17 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -203,7 +203,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t return 0; } else { if (!is_swbp) /* unregister: was it changed by us? */ - return -EINVAL; + return 0; } return 1; @@ -616,15 +616,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static void +static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { /* can happen if uprobe_register() fails */ if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return; + return 0; set_bit(MMF_RECALC_UPROBES, &mm->flags); - set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, mm, vaddr); } /* @@ -740,7 +740,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - if (err) + if (err && is_register) goto free; down_write(&mm->mmap_sem); @@ -756,7 +756,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) err = install_breakpoint(uprobe, mm, vma, info->vaddr); else - remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, mm, info->vaddr); unlock: up_write(&mm->mmap_sem); -- cgit v1.2.2 From 142b18ddc81439acda4bc4231b291e99fe67d507 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Sep 2012 21:56:57 +0200 Subject: uprobes: Fix handle_swbp() vs unregister() + register() race Strictly speaking this race was added by me in 56bb4cf6. However I think that this bug is just another indication that we should move copy_insn/uprobe_analyze_insn code from install_breakpoint() to uprobe_register(), there are a lot of other reasons for that. Until then, add a hack to close the race. A task can hit uprobe U1, but before it calls find_uprobe() this uprobe can be unregistered *AND* another uprobe U2 can be added to uprobes_tree at the same inode/offset. In this case handle_swbp() will use the not-fully-initialized U2, in particular its arch.insn for xol. Add the additional !UPROBE_COPY_INSN check into handle_swbp(), if this flag is not set we simply restart as if the new uprobe was not inserted yet. This is not very nice, we need barriers, but we will remove this hack when we change uprobe_register(). Note: with or without this patch install_breakpoint() can race with itself, yet another reson to kill UPROBE_COPY_INSN altogether. And even the usage of uprobe->flags is not safe. See the next patches. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a1b466d17c17..c718fef28617 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -596,6 +596,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ uprobe->flags |= UPROBE_COPY_INSN; } @@ -1436,6 +1437,14 @@ static void handle_swbp(struct pt_regs *regs) } return; } + /* + * TODO: move copy_insn/etc into _register and remove this hack. + * After we hit the bp, _unregister + _register can install the + * new and not-yet-analyzed uprobe at the same address, restart. + */ + smp_rmb(); /* pairs with wmb() in install_breakpoint() */ + if (unlikely(!(uprobe->flags & UPROBE_COPY_INSN))) + goto restart; utask = current->utask; if (!utask) { -- cgit v1.2.2 From cb9a19fe4aa51afa34786bd383e6614fa0083d58 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Sep 2012 20:11:45 +0200 Subject: uprobes: Introduce prepare_uprobe() Preparation. Extract the copy_insn/arch_uprobe_analyze_insn code from install_breakpoint() into the new helper, prepare_uprobe(). And move uprobe->flags defines from uprobes.h to uprobes.c, nobody else can use them anyway. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 60 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c718fef28617..4f315fa94c52 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -78,6 +78,13 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); +/* Have a copy of original instruction */ +#define UPROBE_COPY_INSN 0x1 +/* Dont run handlers when first register/ last unregister in progress*/ +#define UPROBE_RUN_HANDLER 0x2 +/* Can skip singlestep */ +#define UPROBE_SKIP_SSTEP 0x4 + struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -563,6 +570,37 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } +static int prepare_uprobe(struct uprobe *uprobe, struct file *file, + struct mm_struct *mm, unsigned long vaddr) +{ + int ret = 0; + + if (uprobe->flags & UPROBE_COPY_INSN) + return ret; + + ret = copy_insn(uprobe, file); + if (ret) + goto out; + + ret = -ENOTSUPP; + if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + goto out; + + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); + if (ret) + goto out; + + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + + smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + uprobe->flags |= UPROBE_COPY_INSN; + + out: + return ret; +} + static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) @@ -580,25 +618,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return 0; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma->vm_file); - if (ret) - return ret; - - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -ENOTSUPP; - - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); - if (ret) - return ret; - - /* write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - - smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ - uprobe->flags |= UPROBE_COPY_INSN; - } + ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); + if (ret) + return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), -- cgit v1.2.2 From 4710f05fd146d4739e57a8832a3abc5bd3bf0997 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Sep 2012 20:31:41 +0200 Subject: uprobes: Fix prepare_uprobe() race with itself install_breakpoint() is called under mm->mmap_sem, this protects set_swbp() but not prepare_uprobe(). Two or more different tasks can call install_breakpoint()->prepare_uprobe() at the same time, this leads to numerous problems if UPROBE_COPY_INSN is not set. Just for example, the second copy_insn() can corrupt the already analyzed/fixuped uprobe->arch.insn and race with handle_swbp(). This patch simply adds uprobe->copy_mutex to serialize this code. We could probably reuse ->consumer_rwsem, but this would mean that consumer->handler() can not use mm->mmap_sem, not good. Note: this is another temporary ugly hack until we move this logic into uprobe_register(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4f315fa94c52..7f62b30c4172 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -89,6 +89,7 @@ struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; struct rw_semaphore consumer_rwsem; + struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ @@ -444,6 +445,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); + mutex_init(&uprobe->copy_mutex); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -578,6 +580,10 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (uprobe->flags & UPROBE_COPY_INSN) return ret; + mutex_lock(&uprobe->copy_mutex); + if (uprobe->flags & UPROBE_COPY_INSN) + goto out; + ret = copy_insn(uprobe, file); if (ret) goto out; @@ -598,6 +604,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, uprobe->flags |= UPROBE_COPY_INSN; out: + mutex_unlock(&uprobe->copy_mutex); + return ret; } -- cgit v1.2.2 From 71434f2fcba5c22d6e0d51878ba8e241a5dea5fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Sep 2012 21:12:44 +0200 Subject: uprobes: Fix the racy uprobe->flags manipulation Multiple threads can manipulate uprobe->flags, this is obviously unsafe. For example mmap can set UPROBE_COPY_INSN while register tries to set UPROBE_RUN_HANDLER, the latter can also race with can_skip_sstep() which clears UPROBE_SKIP_SSTEP. Change this code to use bitops. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7f62b30c4172..c92651d619ca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -79,11 +79,11 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; static atomic_t uprobe_events = ATOMIC_INIT(0); /* Have a copy of original instruction */ -#define UPROBE_COPY_INSN 0x1 +#define UPROBE_COPY_INSN 0 /* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER 0x2 +#define UPROBE_RUN_HANDLER 1 /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 0x4 +#define UPROBE_SKIP_SSTEP 2 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -94,7 +94,7 @@ struct uprobe { struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; - int flags; + unsigned long flags; struct arch_uprobe arch; }; @@ -423,7 +423,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) spin_unlock(&uprobes_treelock); /* For now assume that the instruction need not be single-stepped */ - uprobe->flags |= UPROBE_SKIP_SSTEP; + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); return u; } @@ -466,7 +466,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - if (!(uprobe->flags & UPROBE_RUN_HANDLER)) + if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) return; down_read(&uprobe->consumer_rwsem); @@ -577,11 +577,11 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, { int ret = 0; - if (uprobe->flags & UPROBE_COPY_INSN) + if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; mutex_lock(&uprobe->copy_mutex); - if (uprobe->flags & UPROBE_COPY_INSN) + if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); @@ -601,7 +601,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ - uprobe->flags |= UPROBE_COPY_INSN; + set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: mutex_unlock(&uprobe->copy_mutex); @@ -852,7 +852,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * uprobe->consumers = NULL; __uprobe_unregister(uprobe); } else { - uprobe->flags |= UPROBE_RUN_HANDLER; + set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } } @@ -885,7 +885,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (consumer_del(uprobe, uc)) { if (!uprobe->consumers) { __uprobe_unregister(uprobe); - uprobe->flags &= ~UPROBE_RUN_HANDLER; + clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } } @@ -1346,10 +1346,10 @@ bool uprobe_deny_signal(void) */ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) { - if (uprobe->flags & UPROBE_SKIP_SSTEP) { + if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) return true; - uprobe->flags &= ~UPROBE_SKIP_SSTEP; + clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); } return false; } @@ -1473,7 +1473,7 @@ static void handle_swbp(struct pt_regs *regs) * new and not-yet-analyzed uprobe at the same address, restart. */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ - if (unlikely(!(uprobe->flags & UPROBE_COPY_INSN))) + if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto restart; utask = current->utask; -- cgit v1.2.2 From a4fbe35a124526e6759be07bd9c7ea796ba1e00d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 7 Oct 2012 08:36:12 -0700 Subject: rcu: Grace-period initialization excludes only RCU notifier Kirill noted the following deadlock cycle on shutdown involving padata: > With commit 755609a9087fa983f567dc5452b2fa7b089b591f I've got deadlock on > poweroff. > > It guess it happens because of race for cpu_hotplug.lock: > > CPU A CPU B > disable_nonboot_cpus() > _cpu_down() > cpu_hotplug_begin() > mutex_lock(&cpu_hotplug.lock); > __cpu_notify() > padata_cpu_callback() > __padata_remove_cpu() > padata_replace() > synchronize_rcu() > rcu_gp_kthread() > get_online_cpus(); > mutex_lock(&cpu_hotplug.lock); It would of course be good to eliminate grace-period delays from CPU-hotplug notifiers, but that is a separate issue. Deadlock is not an appropriate diagnostic for excessive CPU-hotplug latency. Fortunately, grace-period initialization does not actually need to exclude all of the CPU-hotplug operation, but rather only RCU's own CPU_UP_PREPARE and CPU_DEAD CPU-hotplug notifiers. This commit therefore introduces a new per-rcu_state onoff_mutex that provides the required concurrency control in place of the get_online_cpus() that was previously in rcu_gp_init(). Reported-by: "Kirill A. Shutemov" Signed-off-by: Paul E. McKenney Tested-by: Kirill A. Shutemov --- kernel/rcutree.c | 21 ++++++++++----------- kernel/rcutree.h | 6 ++++++ 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4fb2376ddf06..74df86bd9204 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ } @@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - get_online_cpus(); + mutex_lock(&rsp->onoff_mutex); /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp) cond_resched(); } - put_online_cpus(); + mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ + mutex_lock(&rsp->onoff_mutex); raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ @@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) init_callback_list(rdp); /* Disallow further callbacks on this CPU. */ rdp->nxttail[RCU_NEXT_TAIL] = NULL; + mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); + /* Exclude new grace periods. */ + mutex_lock(&rsp->onoff_mutex); + /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* - * A new grace period might start here. If so, we won't be part - * of it, but that is OK, as we are currently in a quiescent state. - */ - - /* Exclude any attempts to start a new GP on large systems. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; @@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + local_irq_restore(flags); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + mutex_unlock(&rsp->onoff_mutex); } static void __cpuinit rcu_prepare_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5faf05d68326..a240f032848e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -394,11 +394,17 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ + /* End of fields guarded by onofflock. */ + + struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ + /* End of fields guarded by barrier_mutex. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ -- cgit v1.2.2 From 7ac57a89de958fbb5271dc504d0c25e34dbeec32 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:16 -0700 Subject: Kconfig: clean up the "#if defined(arch)" list for exception-trace sysctl entry Introduce SYSCTL_EXCEPTION_TRACE config option and selec it in the architectures requiring support for the "exception-trace" debug_table entry in kernel/sysctl.c. Signed-off-by: Catalin Marinas Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c2a2f8084bad..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1549,8 +1549,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.2 From 075663d19885eb3738fd2d7dbdb8947e12563b68 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 8 Oct 2012 16:28:20 -0700 Subject: CPU hotplug, debug: detect imbalance between get_online_cpus() and put_online_cpus() The synchronization between CPU hotplug readers and writers is achieved by means of refcounting, safeguarded by the cpu_hotplug.lock. get_online_cpus() increments the refcount, whereas put_online_cpus() decrements it. If we ever hit an imbalance between the two, we end up compromising the guarantees of the hotplug synchronization i.e, for example, an extra call to put_online_cpus() can end up allowing a hotplug reader to execute concurrently with a hotplug writer. So, add a WARN_ON() in put_online_cpus() to detect such cases where the refcount can go negative, and also attempt to fix it up, so that we can continue to run. Signed-off-by: Srivatsa S. Bhat Reviewed-by: Yasuaki Ishimatsu Cc: Jiri Kosina Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f560598807c1..42bd331ee0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,6 +80,10 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); + + if (WARN_ON(!cpu_hotplug.refcount)) + cpu_hotplug.refcount++; /* try to fix things up */ + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); -- cgit v1.2.2 From 2dd8ad81e31d0d36a5d448329c646ab43eb17788 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:51 -0700 Subject: mm: use mm->exe_file instead of first VM_EXECUTABLE vma->vm_file Some security modules and oprofile still uses VM_EXECUTABLE for retrieving a task's executable file. After this patch they will use mm->exe_file directly. mm->exe_file is protected with mm->mmap_sem, so locking stays the same. Signed-off-by: Konstantin Khlebnikov Acked-by: Chris Metcalf [arch/tile] Acked-by: Tetsuo Handa [tomoyo] Cc: Alexander Viro Cc: Carsten Otte Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Acked-by: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 13 ++----------- kernel/fork.c | 3 +-- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 29e090cc0e46..f4a7756f999c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1151,7 +1151,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) const struct cred *cred; char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; - struct vm_area_struct *vma; char *tty; if (!ab) @@ -1191,16 +1190,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) if (mm) { down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, " exe=", - &vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); up_read(&mm->mmap_sem); } audit_log_task_context(ab); diff --git a/kernel/fork.c b/kernel/fork.c index a2b1efc20928..a57a993681ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -656,8 +656,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ + /* We need mmap_sem to protect against races with removal of exe_file */ down_read(&mm->mmap_sem); exe_file = mm->exe_file; if (exe_file) -- cgit v1.2.2 From e9714acf8c439688884234dcac2bfc38bb607d38 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:54 -0700 Subject: mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas Currently the kernel sets mm->exe_file during sys_execve() and then tracks number of vmas with VM_EXECUTABLE flag in mm->num_exe_file_vmas, as soon as this counter drops to zero kernel resets mm->exe_file to NULL. Plus it resets mm->exe_file at last mmput() when mm->mm_users drops to zero. VMA with VM_EXECUTABLE flag appears after mapping file with flag MAP_EXECUTABLE, such vmas can appears only at sys_execve() or after vma splitting, because sys_mmap ignores this flag. Usually binfmt module sets mm->exe_file and mmaps executable vmas with this file, they hold mm->exe_file while task is running. comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"), where all this stuff was introduced: > The kernel implements readlink of /proc/pid/exe by getting the file from > the first executable VMA. Then the path to the file is reconstructed and > reported as the result. > > Because of the VMA walk the code is slightly different on nommu systems. > This patch avoids separate /proc/pid/exe code on nommu systems. Instead of > walking the VMAs to find the first executable file-backed VMA we store a > reference to the exec'd file in the mm_struct. > > That reference would prevent the filesystem holding the executable file > from being unmounted even after unmapping the VMAs. So we track the number > of VM_EXECUTABLE VMAs and drop the new reference when the last one is > unmapped. This avoids pinning the mounted filesystem. exe_file's vma accounting is hooked into every file mmap/unmmap and vma split/merge just to fix some hypothetical pinning fs from umounting by mm, which already unmapped all its executable files, but still alive. Seems like currently nobody depends on this behaviour. We can try to remove this logic and keep mm->exe_file until final mmput(). mm->exe_file is still protected with mm->mmap_sem, because we want to change it via new sys_prctl(PR_SET_MM_EXE_FILE). Also via this syscall task can change its mm->exe_file and unpin mountpoint explicitly. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a57a993681ed..ec667f797af3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -622,26 +622,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { if (new_exe_file) @@ -649,7 +629,6 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) if (mm->exe_file) fput(mm->exe_file); mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; } struct file *get_mm_exe_file(struct mm_struct *mm) -- cgit v1.2.2 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f16f3c58f11a..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3671,7 +3671,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; -- cgit v1.2.2 From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ec667f797af3..972762e01024 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1056,7 +1056,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_rwsem(&sig->group_rwsem); #endif - sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit v1.2.2 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 +-- kernel/fork.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..1d9c0a985960 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -735,7 +735,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -744,7 +743,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index 972762e01024..90dace52715e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + vma_interval_tree_add(tmp, mpnt, mapping); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } -- cgit v1.2.2 From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:35 -0700 Subject: mm: interval tree updates Update the generic interval tree code that was introduced in "mm: replace vma prio_tree with an interval tree". Changes: - fixed 'endpoing' typo noticed by Andrew Morton - replaced include/linux/interval_tree_tmpl.h, which was used as a template (including it automatically defined the interval tree functions) with include/linux/interval_tree_generic.h, which only defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself defines the interval tree functions when invoked. Now that is a very long macro which is unfortunate, but it does make the usage sites (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously. - make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro, instead of duplicating that code in the interval tree template. - replaced vma_interval_tree_add(), which was actually handling the nonlinear and interval tree cases, with vma_interval_tree_insert_after() which handles only the interval tree case and has an API that is more consistent with the other interval tree handling functions. The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 90dace52715e..1cd7d581b3b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_add(tmp, mpnt, mapping); + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(tmp, + &mapping->i_mmap_nonlinear); + else + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } -- cgit v1.2.2 From 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 8 Oct 2012 16:33:35 -0700 Subject: mm: wrap calls to set_pte_at_notify with invalidate_range_start and invalidate_range_end In order to allow sleeping during invalidate_page mmu notifier calls, we need to avoid calling when holding the PT lock. In addition to its direct calls, invalidate_page can also be called as a substitute for a change_pte call, in case the notifier client hasn't implemented change_pte. This patch drops the invalidate_page call from change_pte, and instead wraps all calls to change_pte with invalidate_range_start and invalidate_range_end calls. Note that change_pte still cannot sleep after this patch, and that clients implementing change_pte should not take action on it in case the number of outstanding invalidate_range_start calls is larger than one, otherwise they might miss a later invalidation. Signed-off-by: Haggai Eran Cc: Andrea Arcangeli Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1d9c0a985960..98256bc71ee1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -141,10 +141,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; pte_t *ptep; int err; + /* For mmu_notifiers */ + const unsigned long mmun_start = addr; + const unsigned long mmun_end = addr + PAGE_SIZE; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) @@ -173,6 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; } -- cgit v1.2.2 From 26cff4e2aa4d666dc6a120ea34336b5057e3e187 Mon Sep 17 00:00:00 2001 From: "Hildner, Christian" Date: Mon, 8 Oct 2012 15:49:03 +0200 Subject: timers: Fix endless looping between cascade() and internal_add_timer() Adding two (or more) timers with large values for "expires" (they have to reside within tv5 in the same list) leads to endless looping between cascade() and internal_add_timer() in case CONFIG_BASE_SMALL is one and jiffies are crossing the value 1 << 18. The bug was introduced between 2.6.11 and 2.6.12 (and survived for quite some time). This patch ensures that when cascade() is called timers within tv5 are not added endlessly to their own list again, instead they are added to the next lower tv level tv4 (as expected). Signed-off-by: Christian Hildner Reviewed-by: Jan Kiszka Link: http://lkml.kernel.org/r/98673C87CB31274881CFFE0B65ECC87B0F5FC1963E@DEFTHW99EA4MSX.ww902.siemens.net Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/timer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index d5de1b2292aa..367d00858482 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { struct list_head vec[TVN_SIZE]; @@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); } else { int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; + if (idx > MAX_TVAL) { + idx = MAX_TVAL; expires = idx + base->timer_jiffies; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; -- cgit v1.2.2 From 5b3900cd409466c0070b234d941650685ad0c791 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 9 Oct 2012 10:18:23 +0300 Subject: timekeeping: Cast raw_interval to u64 to avoid shift overflow We fixed a bunch of integer overflows in timekeeping code during the 3.6 cycle. I did an audit based on that and found this potential overflow. Signed-off-by: Dan Carpenter Acked-by: John Stultz Link: http://lkml.kernel.org/r/20121009071823.GA19159@elgon.mountain Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16280ff3cf82..3eb3fc7c1600 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1045,7 +1045,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = tk->raw_interval << shift; + raw_nsecs = (u64)tk->raw_interval << shift; raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; -- cgit v1.2.2 From d1c7d97ad58836affde6e39980b96527510b572e Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 4 Oct 2012 19:57:31 -0400 Subject: fs: handle failed audit_log_start properly audit_log_start() may return NULL, this is unchecked by the caller in audit_log_link_denied() and could cause a NULL ptr deref. Introduced by commit a51d9eaa ("fs: add link restriction audit reporting"). Signed-off-by: Sasha Levin Signed-off-by: Al Viro --- kernel/audit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4d0ceede3319..40414e9143db 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1440,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link) ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); + if (!ab) + return; audit_log_format(ab, "op=%s action=denied", operation); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); -- cgit v1.2.2 From 2854d167cc545d0642277bf8b77f972a91146fc6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 27 Sep 2012 14:59:39 +0200 Subject: irqdomain: augment add_simple() to allocate descs Currently we rely on all IRQ chip instances to dynamically allocate their IRQ descriptors unless they use the linear IRQ domain. So for irqdomain_add_legacy() and irqdomain_add_simple() the caller need to make sure that descriptors are allocated. Let's slightly augment the yet unused irqdomain_add_simple() to also allocate descriptors as a means to simplify usage and avoid code duplication throughout the kernel. We warn if descriptors cannot be allocated, e.g. if a platform has the bad habit of hogging descriptors at boot time. Cc: Thomas Gleixner Cc: Grant Likely Cc: Paul Mundt Cc: Russell King Cc: Lee Jones Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- kernel/irq/irqdomain.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db42..4e69e24d3d7d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * @host_data: Controller private data pointer * * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated. * * This is intended to implement the expected behaviour for most * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) - return irq_domain_add_legacy(of_node, size, first_irq, 0, + if (first_irq > 0) { + int irq_base; + + if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { + /* + * Set the descriptor allocator to search for a + * 1-to-1 mapping, such as irq_alloc_desc_at(). + * Use of_node_to_nid() which is defined to + * numa_node_id() on platforms that have no custom + * implementation. + */ + irq_base = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (irq_base < 0) { + WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + first_irq); + irq_base = first_irq; + } + } else + irq_base = first_irq; + + return irq_domain_add_legacy(of_node, size, irq_base, 0, ops, host_data); - else - return irq_domain_add_linear(of_node, size, ops, host_data); + } + + /* A linear domain is the default */ + return irq_domain_add_linear(of_node, size, ops, host_data); } /** -- cgit v1.2.2 From 106a4ee258d14818467829bf0e12aeae14c16cd7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 26 Sep 2012 10:09:40 +0100 Subject: module: signature checking hook We do a very simple search for a particular string appended to the module (which is cache-hot and about to be SHA'd anyway). There's both a config option and a boot parameter which control whether we accept or fail with unsigned modules and modules that are signed with an unknown key. If module signing is enabled, the kernel will be tainted if a module is loaded that is unsigned or has a signature for which we don't have the key. (Useful feedback and tweaks by David Howells ) Signed-off-by: Rusty Russell Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/Makefile | 1 + kernel/module-internal.h | 13 +++++++ kernel/module.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/module_signing.c | 23 ++++++++++++ 4 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 kernel/module-internal.h create mode 100644 kernel/module_signing.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764c..08ba8a6abd1c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o diff --git a/kernel/module-internal.h b/kernel/module-internal.h new file mode 100644 index 000000000000..033c17fd70ef --- /dev/null +++ b/kernel/module-internal.h @@ -0,0 +1,13 @@ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +extern int mod_verify_sig(const void *mod, unsigned long modlen, + const void *sig, unsigned long siglen); diff --git a/kernel/module.c b/kernel/module.c index 74bc19562ca3..68c564edb2c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -58,6 +58,7 @@ #include #include #include +#include "module-internal.h" #define CREATE_TRACE_POINTS #include @@ -102,6 +103,43 @@ static LIST_HEAD(modules); struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ #endif /* CONFIG_KGDB_KDB */ +#ifdef CONFIG_MODULE_SIG +#ifdef CONFIG_MODULE_SIG_FORCE +static bool sig_enforce = true; +#else +static bool sig_enforce = false; + +static int param_set_bool_enable_only(const char *val, + const struct kernel_param *kp) +{ + int err; + bool test; + struct kernel_param dummy_kp = *kp; + + dummy_kp.arg = &test; + + err = param_set_bool(val, &dummy_kp); + if (err) + return err; + + /* Don't let them unset it once it's set! */ + if (!test && sig_enforce) + return -EROFS; + + if (test) + sig_enforce = true; + return 0; +} + +static const struct kernel_param_ops param_ops_bool_enable_only = { + .set = param_set_bool_enable_only, + .get = param_get_bool, +}; +#define param_check_bool_enable_only param_check_bool + +module_param(sig_enforce, bool_enable_only, 0644); +#endif /* !CONFIG_MODULE_SIG_FORCE */ +#endif /* CONFIG_MODULE_SIG */ /* Block module loading/unloading? */ int modules_disabled = 0; @@ -136,6 +174,7 @@ struct load_info { unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; + bool sig_ok; struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -2379,7 +2418,49 @@ static inline void kmemleak_load_module(const struct module *mod, } #endif -/* Sets info->hdr and info->len. */ +#ifdef CONFIG_MODULE_SIG +static int module_sig_check(struct load_info *info, + const void *mod, unsigned long *len) +{ + int err = -ENOKEY; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const void *p = mod, *end = mod + *len; + + /* Poor man's memmem. */ + while ((p = memchr(p, MODULE_SIG_STRING[0], end - p))) { + if (p + markerlen > end) + break; + + if (memcmp(p, MODULE_SIG_STRING, markerlen) == 0) { + const void *sig = p + markerlen; + /* Truncate module up to signature. */ + *len = p - mod; + err = mod_verify_sig(mod, *len, sig, end - sig); + break; + } + p++; + } + + if (!err) { + info->sig_ok = true; + return 0; + } + + /* Not having a signature is only an error if we're strict. */ + if (err == -ENOKEY && !sig_enforce) + err = 0; + + return err; +} +#else /* !CONFIG_MODULE_SIG */ +static int module_sig_check(struct load_info *info, + void *mod, unsigned long *len) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +/* Sets info->hdr, info->len and info->sig_ok. */ static int copy_and_check(struct load_info *info, const void __user *umod, unsigned long len, const char __user *uargs) @@ -2399,6 +2480,10 @@ static int copy_and_check(struct load_info *info, goto free_hdr; } + err = module_sig_check(info, hdr, &len); + if (err) + goto free_hdr; + /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 @@ -2884,6 +2969,12 @@ static struct module *load_module(void __user *umod, goto free_copy; } +#ifdef CONFIG_MODULE_SIG + mod->sig_ok = info.sig_ok; + if (!mod->sig_ok) + add_taint_module(mod, TAINT_FORCED_MODULE); +#endif + /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) diff --git a/kernel/module_signing.c b/kernel/module_signing.c new file mode 100644 index 000000000000..499728aecafb --- /dev/null +++ b/kernel/module_signing.c @@ -0,0 +1,23 @@ +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include "module-internal.h" + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, unsigned long modlen, + const void *sig, unsigned long siglen) +{ + return -ENOKEY; +} -- cgit v1.2.2 From 1d0059f3a468825b5fc5405c636a2f6e02707ffa Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Sep 2012 10:09:50 +0100 Subject: MODSIGN: Add FIPS policy If we're in FIPS mode, we should panic if we fail to verify the signature on a module or we're asked to load an unsigned module in signature enforcing mode. Possibly FIPS mode should automatically enable enforcing mode. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/module.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 68c564edb2c1..0e2da8695f8e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "module-internal.h" #define CREATE_TRACE_POINTS @@ -2447,6 +2448,9 @@ static int module_sig_check(struct load_info *info, } /* Not having a signature is only an error if we're strict. */ + if (err < 0 && fips_enabled) + panic("Module verification failed with error %d in FIPS mode\n", + err); if (err == -ENOKEY && !sig_enforce) err = 0; -- cgit v1.2.2 From d441108c6f77541bb66fcd5b3389415b4c232008 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Sep 2012 10:09:51 +0100 Subject: MODSIGN: Automatically generate module signing keys if missing Automatically generate keys for module signing if they're absent so that allyesconfig doesn't break. The builder should consider generating their own key and certificate, however, so that the keys are appropriately named. The private key for the module signer should be placed in signing_key.priv (unencrypted!) and the public key in an X.509 certificate as signing_key.x509. If a transient key is desired for signing the modules, a config file for 'openssl req' can be placed in x509.genkey, looking something like the following: [ req ] default_bits = 4096 distinguished_name = req_distinguished_name prompt = no x509_extensions = myexts [ req_distinguished_name ] O = Magarathea CN = Glacier signing key emailAddress = slartibartfast@magrathea.h2g2 [ myexts ] basicConstraints=critical,CA:FALSE keyUsage=digitalSignature subjectKeyIdentifier=hash authorityKeyIdentifier=hash The build process will use this to configure: openssl req -new -nodes -utf8 -sha1 -days 36500 -batch \ -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ -keyout signing_key.priv to generate the key. Note that it is required that the X.509 certificate have a subjectKeyIdentifier and an authorityKeyIdentifier. Without those, the certificate will be rejected. These can be used to check the validity of a certificate. Note that 'make distclean' will remove signing_key.{priv,x509} and x509.genkey, whether or not they were generated automatically. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/Makefile | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 08ba8a6abd1c..58c6f111267e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -132,3 +132,52 @@ quiet_cmd_timeconst = TIMEC $@ targets += timeconst.h $(obj)/timeconst.h: $(src)/timeconst.pl FORCE $(call if_changed,timeconst) + +ifeq ($(CONFIG_MODULE_SIG),y) + +############################################################################### +# +# If module signing is requested, say by allyesconfig, but a key has not been +# supplied, then one will need to be generated to make sure the build does not +# fail and that the kernel may be used afterwards. +# +############################################################################### +signing_key.priv signing_key.x509: x509.genkey + @echo "###" + @echo "### Now generating an X.509 key pair to be used for signing modules." + @echo "###" + @echo "### If this takes a long time, you might wish to run rngd in the" + @echo "### background to keep the supply of entropy topped up. It" + @echo "### needs to be run as root, and should use a hardware random" + @echo "### number generator if one is available, eg:" + @echo "###" + @echo "### rngd -r /dev/hwrandom" + @echo "###" + openssl req -new -nodes -utf8 -sha1 -days 36500 -batch \ + -x509 -config x509.genkey \ + -outform DER -out signing_key.x509 \ + -keyout signing_key.priv + @echo "###" + @echo "### Key pair generated." + @echo "###" + +x509.genkey: + @echo Generating X.509 key generation config + @echo >x509.genkey "[ req ]" + @echo >>x509.genkey "default_bits = 4096" + @echo >>x509.genkey "distinguished_name = req_distinguished_name" + @echo >>x509.genkey "prompt = no" + @echo >>x509.genkey "x509_extensions = myexts" + @echo >>x509.genkey + @echo >>x509.genkey "[ req_distinguished_name ]" + @echo >>x509.genkey "O = Magrathea" + @echo >>x509.genkey "CN = Glacier signing key" + @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" + @echo >>x509.genkey + @echo >>x509.genkey "[ myexts ]" + @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" + @echo >>x509.genkey "keyUsage=digitalSignature" + @echo >>x509.genkey "subjectKeyIdentifier=hash" + @echo >>x509.genkey "authorityKeyIdentifier=keyid" +endif +CLEAN_FILES += signing_key.priv signing_key.x509 x509.genkey -- cgit v1.2.2 From 631cc66eb9eaa7296e303197ff1eb0f55e32b61d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Sep 2012 10:09:51 +0100 Subject: MODSIGN: Provide module signing public keys to the kernel Include a PGP keyring containing the public keys required to perform module verification in the kernel image during build and create a special keyring during boot which is then populated with keys of crypto type holding the public keys found in the PGP keyring. These can be seen by root: [root@andromeda ~]# cat /proc/keys 07ad4ee0 I----- 1 perm 3f010000 0 0 crypto modsign.0: RSA 87b9b3bd [] 15c7f8c3 I----- 1 perm 1f030000 0 0 keyring .module_sign: 1/4 ... It is probably worth permitting root to invalidate these keys, resulting in their removal and preventing further modules from being loaded with that key. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/Makefile | 11 ++++- kernel/modsign_pubkey.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/module-internal.h | 2 + 3 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 kernel/modsign_pubkey.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 58c6f111267e..111a845460c9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -55,7 +55,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -134,6 +134,13 @@ $(obj)/timeconst.h: $(src)/timeconst.pl FORCE $(call if_changed,timeconst) ifeq ($(CONFIG_MODULE_SIG),y) +# +# Pull the signing certificate and any extra certificates into the kernel +# +extra_certificates: + touch $@ + +kernel/modsign_pubkey.o: signing_key.x509 extra_certificates ############################################################################### # @@ -180,4 +187,4 @@ x509.genkey: @echo >>x509.genkey "subjectKeyIdentifier=hash" @echo >>x509.genkey "authorityKeyIdentifier=keyid" endif -CLEAN_FILES += signing_key.priv signing_key.x509 x509.genkey +CLEAN_FILES += signing_key.priv signing_key.x509 x509.genkey extra_certificates diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 000000000000..4646eb2c3820 --- /dev/null +++ b/kernel/modsign_pubkey.c @@ -0,0 +1,113 @@ +/* Public keys for module signature verification + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "module-internal.h" + +struct key *modsign_keyring; + +extern __initdata const u8 modsign_certificate_list[]; +extern __initdata const u8 modsign_certificate_list_end[]; +asm(".section .init.data,\"aw\"\n" + "modsign_certificate_list:\n" + ".incbin \"signing_key.x509\"\n" + ".incbin \"extra_certificates\"\n" + "modsign_certificate_list_end:" + ); + +/* + * We need to make sure ccache doesn't cache the .o file as it doesn't notice + * if modsign.pub changes. + */ +static __initdata const char annoy_ccache[] = __TIME__ "foo"; + +/* + * Load the compiled-in keys + */ +static __init int module_verify_init(void) +{ + pr_notice("Initialise module verification\n"); + + modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(modsign_keyring)) + panic("Can't allocate module signing keyring\n"); + + if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) + panic("Can't instantiate module signing keyring\n"); + + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(module_verify_init); + +/* + * Load the compiled-in keys + */ +static __init int load_module_signing_keys(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading module verification certificates\n"); + + end = modsign_certificate_list_end; + p = modsign_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(modsign_keyring, 1), + "asymmetric", + NULL, + p, + plen, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) + pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + else + pr_notice("MODSIGN: Loaded cert '%s'\n", + key_ref_to_ptr(key)->description); + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 033c17fd70ef..6114a13419bd 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,5 +9,7 @@ * 2 of the Licence, or (at your option) any later version. */ +extern struct key *modsign_keyring; + extern int mod_verify_sig(const void *mod, unsigned long modlen, const void *sig, unsigned long siglen); -- cgit v1.2.2 From 48ba2462ace6072741fd8d0058207d630ce93bf1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Sep 2012 10:11:03 +0100 Subject: MODSIGN: Implement module signature checking Check the signature on the module against the keys compiled into the kernel or available in a hardware key store. Currently, only RSA keys are supported - though that's easy enough to change, and the signature is expected to contain raw components (so not a PGP or PKCS#7 formatted blob). The signature blob is expected to consist of the following pieces in order: (1) The binary identifier for the key. This is expected to match the SubjectKeyIdentifier from an X.509 certificate. Only X.509 type identifiers are currently supported. (2) The signature data, consisting of a series of MPIs in which each is in the format of a 2-byte BE word sizes followed by the content data. (3) A 12 byte information block of the form: struct module_signature { enum pkey_algo algo : 8; enum pkey_hash_algo hash : 8; enum pkey_id_type id_type : 8; u8 __pad; __be32 id_length; __be32 sig_length; }; The three enums are defined in crypto/public_key.h. 'algo' contains the public-key algorithm identifier (0->DSA, 1->RSA). 'hash' contains the digest algorithm identifier (0->MD4, 1->MD5, 2->SHA1, etc.). 'id_type' contains the public-key identifier type (0->PGP, 1->X.509). '__pad' should be 0. 'id_length' should contain in the binary identifier length in BE form. 'sig_length' should contain in the signature data length in BE form. The lengths are in BE order rather than CPU order to make dealing with cross-compilation easier. Signed-off-by: David Howells Signed-off-by: Rusty Russell (minor Kconfig fix) --- kernel/module_signing.c | 222 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 221 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 499728aecafb..6b09f6983ac0 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -11,13 +11,233 @@ #include #include +#include +#include +#include #include "module-internal.h" +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + enum pkey_algo algo : 8; /* Public-key crypto algorithm */ + enum pkey_hash_algo hash : 8; /* Digest algorithm */ + enum pkey_id_type id_type : 8; /* Key identifier type */ + u8 signer_len; /* Length of signer's name */ + u8 key_id_len; /* Length of key identifier */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +/* + * Digest the module contents. + */ +static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, + const void *mod, + unsigned long modlen) +{ + struct public_key_signature *pks; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + pr_devel("==>%s()\n", __func__); + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of our + * context data and the digest output buffer on the end of that. + */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error_no_pks; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (void *)pks + sizeof(*pks); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, mod, modlen, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + pr_devel("<==%s() = ok\n", __func__); + return pks; + +error: + kfree(pks); +error_no_pks: + crypto_free_shash(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ERR_PTR(ret); +} + +/* + * Extract an MPI array from the signature data. This represents the actual + * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the + * size of the MPI in bytes. + * + * RSA signatures only have one MPI, so currently we only read one. + */ +static int mod_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + size_t nbytes; + MPI mpi; + + if (len < 3) + return -EBADMSG; + nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; + data += 2; + len -= 2; + if (len != nbytes) + return -EBADMSG; + + mpi = mpi_read_raw_data(data, nbytes); + if (!mpi) + return -ENOMEM; + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +/* + * Request an asymmetric key. + */ +static struct key *request_asymmetric_key(const char *signer, size_t signer_len, + const u8 *key_id, size_t key_id_len) +{ + key_ref_t key; + size_t i; + char *id, *q; + + pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); + + /* Construct an identifier. */ + id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); + if (!id) + return ERR_PTR(-ENOKEY); + + memcpy(id, signer, signer_len); + + q = id + signer_len; + *q++ = ':'; + *q++ = ' '; + for (i = 0; i < key_id_len; i++) { + *q++ = hex_asc[*key_id >> 4]; + *q++ = hex_asc[*key_id++ & 0x0f]; + } + + *q = 0; + + pr_debug("Look up: \"%s\"\n", id); + + key = keyring_search(make_key_ref(modsign_keyring, 1), + &key_type_asymmetric, id); + if (IS_ERR(key)) + pr_warn("Request for unknown module key '%s' err %ld\n", + id, PTR_ERR(key)); + kfree(id); + + if (IS_ERR(key)) { + switch (PTR_ERR(key)) { + /* Hide some search errors */ + case -EACCES: + case -ENOTDIR: + case -EAGAIN: + return ERR_PTR(-ENOKEY); + default: + return ERR_CAST(key); + } + } + + pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); + return key_ref_to_ptr(key); +} + /* * Verify the signature on a module. */ int mod_verify_sig(const void *mod, unsigned long modlen, const void *sig, unsigned long siglen) { - return -ENOKEY; + struct public_key_signature *pks; + struct module_signature ms; + struct key *key; + size_t sig_len; + int ret; + + pr_devel("==>%s(,%lu,,%lu,)\n", __func__, modlen, siglen); + + if (siglen <= sizeof(ms)) + return -EBADMSG; + + memcpy(&ms, sig + (siglen - sizeof(ms)), sizeof(ms)); + siglen -= sizeof(ms); + + sig_len = be32_to_cpu(ms.sig_len); + if (sig_len >= siglen || + siglen - sig_len != (size_t)ms.signer_len + ms.key_id_len) + return -EBADMSG; + + /* For the moment, only support RSA and X.509 identifiers */ + if (ms.algo != PKEY_ALGO_RSA || + ms.id_type != PKEY_ID_X509) + return -ENOPKG; + + if (ms.hash >= PKEY_HASH__LAST || + !pkey_hash_algo[ms.hash]) + return -ENOPKG; + + key = request_asymmetric_key(sig, ms.signer_len, + sig + ms.signer_len, ms.key_id_len); + if (IS_ERR(key)) + return PTR_ERR(key); + + pks = mod_make_digest(ms.hash, mod, modlen); + if (IS_ERR(pks)) { + ret = PTR_ERR(pks); + goto error_put_key; + } + + ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, + sig_len); + if (ret < 0) + goto error_free_pks; + + ret = verify_signature(key, pks); + pr_devel("verify_signature() = %d\n", ret); + +error_free_pks: + mpi_free(pks->rsa.s); + kfree(pks); +error_put_key: + key_put(key); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; } -- cgit v1.2.2 From 5e8cb1e441dd74723898cd28fe64af5651023af0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 28 Sep 2012 11:16:57 +0100 Subject: MODSIGN: Use the same digest for the autogen key sig as for the module sig Use the same digest type for the autogenerated key signature as for the module signature so that the hash algorithm is guaranteed to be present in the kernel. Without this, the X.509 certificate loader may reject the X.509 certificate so generated because it was self-signed and the signature will be checked against itself - but this won't work if the digest algorithm must be loaded as a module. The symptom is that the key fails to load with the following message emitted into the kernel log: MODSIGN: Problem loading in-kernel X.509 certificate (-65) the error in brackets being -ENOPKG. What you should see is something like: MODSIGN: Loaded cert 'Magarathea: Glacier signing key: 9588321144239a119d3406d4c4cf1fbae1836fa0' Note that this doesn't apply to certificates that are not self-signed as we don't check those currently as they require the parent CA certificate to be available. Reported-by: Rusty Russell Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/Makefile | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 111a845460c9..a799029320d1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -149,6 +149,26 @@ kernel/modsign_pubkey.o: signing_key.x509 extra_certificates # fail and that the kernel may be used afterwards. # ############################################################################### +sign_key_with_hash := +ifeq ($(CONFIG_MODULE_SIG_SHA1),y) +sign_key_with_hash := -sha1 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA224),y) +sign_key_with_hash := -sha224 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA256),y) +sign_key_with_hash := -sha256 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA384),y) +sign_key_with_hash := -sha384 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA512),y) +sign_key_with_hash := -sha512 +endif +ifeq ($(sign_key_with_hash),) +$(error Could not determine digest type to use from kernel config) +endif + signing_key.priv signing_key.x509: x509.genkey @echo "###" @echo "### Now generating an X.509 key pair to be used for signing modules." @@ -160,7 +180,7 @@ signing_key.priv signing_key.x509: x509.genkey @echo "###" @echo "### rngd -r /dev/hwrandom" @echo "###" - openssl req -new -nodes -utf8 -sha1 -days 36500 -batch \ + openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ -keyout signing_key.priv -- cgit v1.2.2 From e7d113bcf243a838ba1c32025172ab214349dfad Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 28 Sep 2012 11:16:57 +0100 Subject: MODSIGN: Use utf8 strings in signer's name in autogenerated X.509 certs Place an indication that the certificate should use utf8 strings into the x509.genkey template generated by kernel/Makefile. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a799029320d1..e951adf93567 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -194,6 +194,7 @@ x509.genkey: @echo >>x509.genkey "default_bits = 4096" @echo >>x509.genkey "distinguished_name = req_distinguished_name" @echo >>x509.genkey "prompt = no" + @echo >>x509.genkey "string_mask = utf8only" @echo >>x509.genkey "x509_extensions = myexts" @echo >>x509.genkey @echo >>x509.genkey "[ req_distinguished_name ]" -- cgit v1.2.2 From d5b719365ec13ef825f2548ba54903b9d029238c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 2 Oct 2012 14:35:24 +0930 Subject: MODSIGN: Make mrproper should remove generated files. It doesn't, because the clean targets don't include kernel/Makefile, and because two files were missing from the list. Signed-off-by: Rusty Russell --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index e951adf93567..d3611c8a6b8d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -208,4 +208,3 @@ x509.genkey: @echo >>x509.genkey "subjectKeyIdentifier=hash" @echo >>x509.genkey "authorityKeyIdentifier=keyid" endif -CLEAN_FILES += signing_key.priv signing_key.x509 x509.genkey extra_certificates -- cgit v1.2.2 From 8e49f418c9632790bf456634742d34d97120a784 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Wed, 10 Oct 2012 16:40:27 -0700 Subject: ring-buffer: Check for uninitialized cpu buffer before resizing With a system where, num_present_cpus < num_possible_cpus, even if all CPUs are online, non-present CPUs don't have per_cpu buffers allocated. If per_cpu//buffer_size_kb is modified for such a CPU, it can cause a panic due to NULL dereference in ring_buffer_resize(). To fix this, resize operation is allowed only if the per-cpu buffer has been initialized. Link: http://lkml.kernel.org/r/1349912427-6486-1-git-send-email-vnagarnaik@google.com Cc: stable@vger.kernel.org # 3.5+ Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b32ed0e385a5..b979426d16c6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1567,6 +1567,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, put_online_cpus(); } else { + /* Make sure this CPU has been intitialized */ + if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) + goto out; + cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) -- cgit v1.2.2 From fb45550d76bb584857cf0ea3be79fa78207a3cff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Oct 2012 20:09:44 -0400 Subject: make sure that kernel_thread() callbacks call do_exit() themselves Most of them never returned anyway - only two functions had to be changed. That allows to simplify their callers a whole lot. Note that this does *not* apply to kthread_run() callbacks - all of those had been called from the same kernel_thread() callback, which did do_exit() already. This is strictly about very few low-level kernel_thread() callbacks (there are only 6 of those, mostly as part of kthread.h and kmod.h exported mechanisms, plus kernel_init() itself). Signed-off-by: Al Viro --- kernel/kmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c6..b6e5ca9c758a 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -225,7 +225,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? */ fail: sub_info->retval = retval; - return 0; + do_exit(0); } static int call_helper(void *data) @@ -292,7 +292,7 @@ static int wait_for_helper(void *data) } umh_complete(sub_info); - return 0; + do_exit(0); } /* This is run by khelper thread */ -- cgit v1.2.2 From 1c2e51e8c162417d2831007ec256ede06c3a0201 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:20 -0400 Subject: audit: pass in dentry to audit_copy_inode wherever possible In some cases, we were passing in NULL even when we have a dentry. Reported-by: Eric Paris Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f4a7756f999c..4d1bd62b090b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2212,7 +2212,7 @@ void __audit_inode_child(const struct dentry *dentry, if (!strcmp(dname, n->name) || !audit_compare_dname_path(dname, n->name, &dirlen)) { if (inode) - audit_copy_inode(n, NULL, inode); + audit_copy_inode(n, dentry, inode); else n->ino = (unsigned long)-1; found_child = n->name; @@ -2244,7 +2244,7 @@ add_names: } if (inode) - audit_copy_inode(n, NULL, inode); + audit_copy_inode(n, dentry, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); -- cgit v1.2.2 From 9cec9d68ae53aae60b4a1fca4505c75a1d026392 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:21 -0400 Subject: audit: no need to walk list in audit_inode if name is NULL If name is NULL then the condition in the loop will never be true. Also, with this change, we can eliminate the check for n->name == NULL since the equivalence check will never be true if it is. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4d1bd62b090b..2e481141b014 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2147,11 +2147,15 @@ void __audit_inode(const char *name, const struct dentry *dentry) if (!context->in_syscall) return; + if (!name) + goto out_alloc; + list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) + if (n->name == name) goto out; } +out_alloc: /* unable to find the name from a previous getname() */ n = audit_alloc_name(context); if (!n) -- cgit v1.2.2 From c43a25abba97c7d87131e71db6be24b24d7791a5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:21 -0400 Subject: audit: reverse arguments to audit_inode_child Most of the callers get called with an inode and dentry in the reverse order. The compiler then has to reshuffle the arg registers and/or stack in order to pass them on to audit_inode_child. Reverse those arguments for a micro-optimization. Reported-by: Eric Paris Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2e481141b014..40743af02d8f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2166,9 +2166,9 @@ out: } /** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited + * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent + * @dentry: dentry being audited * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -2178,8 +2178,8 @@ out: * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct dentry *dentry, - const struct inode *parent) +void __audit_inode_child(const struct inode *parent, + const struct dentry *dentry) { struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; -- cgit v1.2.2 From 78e2e802a8519031e5858595070b39713e26340d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:22 -0400 Subject: audit: add a new "type" field to audit_names struct For now, we just have two possibilities: UNKNOWN: for a new audit_names record that we don't know anything about yet NORMAL: for everything else In later patches, we'll add other types so we can distinguish and update records created under different circumstances. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 40743af02d8f..19b232f86d70 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -120,6 +120,7 @@ struct audit_names { struct audit_cap_data fcap; unsigned int fcap_ver; int name_len; /* number of name's characters to log */ + unsigned char type; /* record type */ bool name_put; /* call __putname() for this name */ /* * This was an allocated audit_names and not from the array of @@ -1995,7 +1996,8 @@ retry: #endif } -static struct audit_names *audit_alloc_name(struct audit_context *context) +static struct audit_names *audit_alloc_name(struct audit_context *context, + unsigned char type) { struct audit_names *aname; @@ -2010,6 +2012,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } aname->ino = (unsigned long)-1; + aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; @@ -2040,7 +2043,7 @@ void __audit_getname(const char *name) return; } - n = audit_alloc_name(context); + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; @@ -2157,12 +2160,13 @@ void __audit_inode(const char *name, const struct dentry *dentry) out_alloc: /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); + n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; out: handle_path(dentry); audit_copy_inode(n, dentry, inode); + n->type = AUDIT_TYPE_NORMAL; } /** @@ -2219,6 +2223,7 @@ void __audit_inode_child(const struct inode *parent, audit_copy_inode(n, dentry, inode); else n->ino = (unsigned long)-1; + n->type = AUDIT_TYPE_NORMAL; found_child = n->name; goto add_names; } @@ -2226,14 +2231,14 @@ void __audit_inode_child(const struct inode *parent, add_names: if (!found_parent) { - n = audit_alloc_name(context); + n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; audit_copy_inode(n, NULL, parent); } if (!found_child) { - n = audit_alloc_name(context); + n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; -- cgit v1.2.2 From bfcec7087458812f575d9022b2d151641f34ee84 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:23 -0400 Subject: audit: set the name_len in audit_inode for parent lookups Currently, this gets set mostly by happenstance when we call into audit_inode_child. While that might be a little more efficient, it seems wrong. If the syscall ends up failing before audit_inode_child ever gets called, then you'll have an audit_names record that shows the full path but has the parent inode info attached. Fix this by passing in a parent flag when we call audit_inode that gets set to the value of LOOKUP_PARENT. We can then fix up the pathname for the audit entry correctly from the get-go. While we're at it, clean up the no-op macro for audit_inode in the !CONFIG_AUDITSYSCALL case. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/audit.h | 1 + kernel/auditfilter.c | 30 ++++++++++++++++++++++++++++++ kernel/auditsc.c | 41 +++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 9eb3d79482b6..163b9a5d9441 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -78,6 +78,7 @@ extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); +extern int parent_len(const char *path); extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index c4bcdbaf4d4d..71bb13598df3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1298,6 +1298,36 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) } } +/** + * parent_len - find the length of the parent portion of a pathname + * @path: pathname of which to determine length + */ +int parent_len(const char *path) +{ + int plen; + const char *p; + + plen = strlen(path); + + if (plen == 0) + return plen; + + /* disregard trailing slashes */ + p = path + plen - 1; + while ((*p == '/') && (p > path)) + p--; + + /* walk backward until we find the next slash or hit beginning */ + while ((*p != '/') && (p > path)) + p--; + + /* did we find a slash? Then increment to include it in path */ + if (*p == '/') + p++; + + return p - path; +} + /* Compare given dentry name with last component in given path, * return of 0 indicates a match. */ int audit_compare_dname_path(const char *dname, const char *path, diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 19b232f86d70..b87b28947acc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2135,13 +2135,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent } /** - * audit_inode - store the inode and device from a lookup + * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). + * @parent: does this dentry represent the parent? */ -void __audit_inode(const char *name, const struct dentry *dentry) +void __audit_inode(const char *name, const struct dentry *dentry, + unsigned int parent) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; @@ -2154,19 +2154,38 @@ void __audit_inode(const char *name, const struct dentry *dentry) goto out_alloc; list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name == name) - goto out; + /* does the name pointer match? */ + if (n->name != name) + continue; + + /* match the correct record type */ + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } } out_alloc: - /* unable to find the name from a previous getname() */ + /* unable to find the name from a previous getname(). Allocate a new + * anonymous entry. + */ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; out: + if (parent) { + n->name_len = n->name ? parent_len(n->name) : AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_PARENT; + } else { + n->name_len = AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_NORMAL; + } handle_path(dentry); audit_copy_inode(n, dentry, inode); - n->type = AUDIT_TYPE_NORMAL; } /** @@ -2190,7 +2209,6 @@ void __audit_inode_child(const struct inode *parent, const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; struct audit_names *n; - int dirlen = 0; if (!context->in_syscall) return; @@ -2204,8 +2222,7 @@ void __audit_inode_child(const struct inode *parent, continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, &dirlen)) { - n->name_len = dirlen; /* update parent data in place */ + !audit_compare_dname_path(dname, n->name, NULL)) { found_parent = n->name; goto add_names; } @@ -2218,7 +2235,7 @@ void __audit_inode_child(const struct inode *parent, /* strcmp() is the more likely scenario */ if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, &dirlen)) { + !audit_compare_dname_path(dname, n->name, NULL)) { if (inode) audit_copy_inode(n, dentry, inode); else -- cgit v1.2.2 From 563a0d1236c2c58d584ef122a5cdc9930e5860b3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:24 -0400 Subject: audit: remove dirlen argument to audit_compare_dname_path All the callers set this to NULL now. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/audit.h | 3 +-- kernel/audit_watch.c | 2 +- kernel/auditfilter.c | 6 +----- kernel/auditsc.c | 4 ++-- 4 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 163b9a5d9441..1038e23eb61c 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -79,8 +79,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen); +extern int audit_compare_dname_path(const char *dname, const char *path); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, const void *payload, int size); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c22ec3d87bc..deb97c139e0c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -265,7 +265,7 @@ static void audit_update_watch(struct audit_parent *parent, /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) + if (audit_compare_dname_path(dname, owatch->path)) continue; /* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 71bb13598df3..ff4011c19b13 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1330,8 +1330,7 @@ int parent_len(const char *path) /* Compare given dentry name with last component in given path, * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen) +int audit_compare_dname_path(const char *dname, const char *path) { int dlen, plen; const char *p; @@ -1360,9 +1359,6 @@ int audit_compare_dname_path(const char *dname, const char *path, p++; } - /* return length of path's directory component */ - if (dirlen) - *dirlen = p - path; return strncmp(p, dname, dlen); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b87b28947acc..09c7b6b4f8e6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2222,7 +2222,7 @@ void __audit_inode_child(const struct inode *parent, continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, NULL)) { + !audit_compare_dname_path(dname, n->name)) { found_parent = n->name; goto add_names; } @@ -2235,7 +2235,7 @@ void __audit_inode_child(const struct inode *parent, /* strcmp() is the more likely scenario */ if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, NULL)) { + !audit_compare_dname_path(dname, n->name)) { if (inode) audit_copy_inode(n, dentry, inode); else -- cgit v1.2.2 From 29e9a3467c1367549568d7d411d5f30209ae181b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 10 Oct 2012 15:25:24 -0400 Subject: audit: make audit_compare_dname_path use parent_len helper Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditfilter.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ff4011c19b13..d705eb17661b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1332,32 +1332,19 @@ int parent_len(const char *path) * return of 0 indicates a match. */ int audit_compare_dname_path(const char *dname, const char *path) { - int dlen, plen; + int dlen, pathlen, parentlen; const char *p; - if (!dname || !path) - return 1; - dlen = strlen(dname); - plen = strlen(path); - if (plen < dlen) + pathlen = strlen(path); + if (pathlen < dlen) return 1; - /* disregard trailing slashes */ - p = path + plen - 1; - while ((*p == '/') && (p > path)) - p--; - - /* find last path component */ - p = p - dlen + 1; - if (p < path) + parentlen = parent_len(path); + if (pathlen - parentlen != dlen) return 1; - else if (p > path) { - if (*--p != '/') - return 1; - else - p++; - } + + p = path + parentlen; return strncmp(p, dname, dlen); } -- cgit v1.2.2 From e3d6b07b8ba161f638b026feba0c3c97875d7f1c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:25 -0400 Subject: audit: optimize audit_compare_dname_path In the cases where we already know the length of the parent, pass it as a parm so we don't need to recompute it. In the cases where we don't know the length, pass in AUDIT_NAME_FULL (-1) to indicate that it should be determined. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/audit.h | 5 ++++- kernel/audit_watch.c | 3 ++- kernel/auditfilter.c | 16 +++++++++++----- kernel/auditsc.c | 8 +++----- 4 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 1038e23eb61c..d51cba868e1b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,12 +74,15 @@ static inline int audit_hash_ino(u32 ino) return (ino & (AUDIT_INODE_BUCKETS-1)); } +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, int plen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, const void *payload, int size); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index deb97c139e0c..9a9ae6e3d290 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent, /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path)) + if (audit_compare_dname_path(dname, owatch->path, + AUDIT_NAME_FULL)) continue; /* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d705eb17661b..7f19f23d38a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1328,11 +1328,17 @@ int parent_len(const char *path) return p - path; } -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path) +/** + * audit_compare_dname_path - compare given dentry name with last component in + * given path. Return of 0 indicates a match. + * @dname: dentry name that we're comparing + * @path: full pathname that we're comparing + * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL + * here indicates that we must compute this value. + */ +int audit_compare_dname_path(const char *dname, const char *path, int parentlen) { - int dlen, pathlen, parentlen; + int dlen, pathlen; const char *p; dlen = strlen(dname); @@ -1340,7 +1346,7 @@ int audit_compare_dname_path(const char *dname, const char *path) if (pathlen < dlen) return 1; - parentlen = parent_len(path); + parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; if (pathlen - parentlen != dlen) return 1; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 09c7b6b4f8e6..0160a68b4d7f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -81,9 +81,6 @@ * a name dynamically and also add those to the list anchored by names_list. */ #define AUDIT_NAMES 5 -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -2222,7 +2219,7 @@ void __audit_inode_child(const struct inode *parent, continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name)) { + !audit_compare_dname_path(dname, n->name, n->name_len)) { found_parent = n->name; goto add_names; } @@ -2235,7 +2232,8 @@ void __audit_inode_child(const struct inode *parent, /* strcmp() is the more likely scenario */ if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name)) { + !audit_compare_dname_path(dname, n->name, + AUDIT_NAME_FULL)) { if (inode) audit_copy_inode(n, dentry, inode); else -- cgit v1.2.2 From 4fa6b5ecbf092c6ee752ece8a55d71f663d23254 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:25 -0400 Subject: audit: overhaul __audit_inode_child to accomodate retrying In order to accomodate retrying path-based syscalls, we need to add a new "type" argument to audit_inode_child. This will tell us whether we're looking for a child entry that represents a create or a delete. If we find a parent, don't automatically assume that we need to create a new entry. Instead, use the information we have to try to find an existing entry first. Update it if one is found and create a new one if not. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 57 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0160a68b4d7f..d147585e9ef3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2189,6 +2189,7 @@ out: * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent * @dentry: dentry being audited + * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -2199,13 +2200,13 @@ out: * unsuccessful attempts. */ void __audit_inode_child(const struct inode *parent, - const struct dentry *dentry) + const struct dentry *dentry, + const unsigned char type) { struct audit_context *context = current->audit_context; - const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; - struct audit_names *n; + struct audit_names *n, *found_parent = NULL, *found_child = NULL; if (!context->in_syscall) return; @@ -2213,63 +2214,65 @@ void __audit_inode_child(const struct inode *parent, if (inode) handle_one(inode); - /* parent is more likely, look for it first */ + /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + if (!n->name || n->type != AUDIT_TYPE_PARENT) continue; if (n->ino == parent->i_ino && !audit_compare_dname_path(dname, n->name, n->name_len)) { - found_parent = n->name; - goto add_names; + found_parent = n; + break; } } - /* no matching parent, look for matching child */ + /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + /* can only match entries that have a name */ + if (!n->name || n->type != type) + continue; + + /* if we found a parent, make sure this one is a child of it */ + if (found_parent && (n->name != found_parent->name)) continue; - /* strcmp() is the more likely scenario */ if (!strcmp(dname, n->name) || !audit_compare_dname_path(dname, n->name, + found_parent ? + found_parent->name_len : AUDIT_NAME_FULL)) { - if (inode) - audit_copy_inode(n, dentry, inode); - else - n->ino = (unsigned long)-1; - n->type = AUDIT_TYPE_NORMAL; - found_child = n->name; - goto add_names; + found_child = n; + break; } } -add_names: if (!found_parent) { - n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); + /* create a new, "anonymous" parent record */ + n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent); } if (!found_child) { - n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); - if (!n) + found_child = audit_alloc_name(context, type); + if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; + found_child->name = found_parent->name; + found_child->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - n->name_put = false; + found_child->name_put = false; } - - if (inode) - audit_copy_inode(n, dentry, inode); } + if (inode) + audit_copy_inode(found_child, dentry, inode); + else + found_child->ino = (unsigned long)-1; } EXPORT_SYMBOL_GPL(__audit_inode_child); -- cgit v1.2.2 From cfd4da175599938f21a81cdd80df02fa4151dcba Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:27 -0400 Subject: acct: constify the name arg to acct_on Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6cd7529c9e6a..5be01017d30f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, } } -static int acct_on(char *name) +static int acct_on(const char *name) { struct file *file; struct vfsmount *mnt; -- cgit v1.2.2 From f30fed10c440a25937e509860fa207399b26efe5 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 12 Oct 2012 06:37:33 -0500 Subject: kgdb: Add module event hooks Allow gdb to auto load kernel modules when it is attached, which makes it trivially easy to debug module init functions or pre-set breakpoints in a kernel module that has not loaded yet. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..8bfa373cd5fd 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -688,6 +688,22 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } +/* + * GDB places a breakpoint at this function to know dynamically + * loaded objects. It's not defined static so that only one instance with this + * name exists in the kernel. + */ + +static int module_event(struct notifier_block *self, unsigned long val, + void *data) +{ + return 0; +} + +static struct notifier_block dbg_module_load_nb = { + .notifier_call = module_event, +}; + int kgdb_nmicallback(int cpu, void *regs) { #ifdef CONFIG_SMP @@ -816,6 +832,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); @@ -839,6 +856,7 @@ static void kgdb_unregister_callbacks(void) if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); + unregister_module_notifier(&dbg_module_load_nb); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); -- cgit v1.2.2 From d1871b38fccdc4b6575b0cabdea9e06bc70167eb Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Sun, 26 Aug 2012 21:43:12 -0500 Subject: kdb: Fix dmesg/bta scroll to quit with 'q' If you press 'q' the pager should exit instead of printing everything from dmesg which can really bog down a 9600 baud serial link. The same is true for the bta command. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_bt.c | 2 ++ kernel/debug/kdb/kdb_main.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 07c9bbb94a0b..b03e0e814e43 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv) } /* Now the inactive tasks */ kdb_do_each_thread(g, p) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, argcount, btaprompt)) diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a9..1afeb5c1e5a9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2100,6 +2100,8 @@ static int kdb_dmesg(int argc, const char **argv) } if (!lines--) break; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; kdb_printf("%.*s\n", (int)len - 1, buf); } -- cgit v1.2.2 From 17b572e82032bc246324ce136696656b66d4e3f1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Sun, 26 Aug 2012 22:37:03 -0500 Subject: kdb,vt_console: Fix missed data due to pager overruns It is possible to miss data when using the kdb pager. The kdb pager does not pay attention to the maximum column constraint of the screen or serial terminal. This result is not incrementing the shown lines correctly and the pager will print more lines that fit on the screen. Obviously that is less than useful when using a VGA console where you cannot scroll back. The pager will now look at the kdb_buffer string to see how many characters are printed. It might not be perfect considering you can output ASCII that might move the cursor position, but it is a substantially better approximation for viewing dmesg and trace logs. This also means that the vt screen needs to set the kdb COLUMNS variable. Cc: Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0a69d2adc4f3..14ff4849262c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap) { int diag; int linecount; + int colcount; int logging, saved_loglevel = 0; int saved_trap_printk; int got_printf_lock = 0; @@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap) if (diag || linecount <= 1) linecount = 24; + diag = kdbgetintenv("COLUMNS", &colcount); + if (diag || colcount <= 1) + colcount = 80; + diag = kdbgetintenv("LOGGING", &logging); if (diag) logging = 0; @@ -690,7 +695,7 @@ kdb_printit: gdbstub_msg_write(kdb_buffer, retlen); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = strlen(kdb_buffer); + len = retlen; cp = kdb_buffer; while (len--) { dbg_io_ops->write_char(*cp); @@ -709,11 +714,29 @@ kdb_printit: printk(KERN_INFO "%s", kdb_buffer); } - if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) - kdb_nextline++; + if (KDB_STATE(PAGER)) { + /* + * Check printed string to decide how to bump the + * kdb_nextline to control when the more prompt should + * show up. + */ + int got = 0; + len = retlen; + while (len--) { + if (kdb_buffer[len] == '\n') { + kdb_nextline++; + got = 0; + } else if (kdb_buffer[len] == '\r') { + got = 0; + } else { + got++; + } + } + kdb_nextline += got / (colcount + 1); + } /* check for having reached the LINES number of printed lines */ - if (kdb_nextline == linecount) { + if (kdb_nextline >= linecount) { char buf1[16] = ""; /* Watch out for recursion here. Any routine that calls @@ -765,7 +788,7 @@ kdb_printit: kdb_grepping_flag = 0; kdb_printf("\n"); } else if (buf1[0] == ' ') { - kdb_printf("\n"); + kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ } else if (buf1[0] == '\n') { kdb_nextline = linecount - 1; -- cgit v1.2.2 From a74fb73c12398b250fdc5e333a11e15a9e3a84fc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Oct 2012 21:28:25 -0400 Subject: infrastructure for saner ret_from_kernel_thread semantics * allow kernel_execve() leave the actual return to userland to caller (selected by CONFIG_GENERIC_KERNEL_EXECVE). Callers updated accordingly. * architecture that does select GENERIC_KERNEL_EXECVE in its Kconfig should have its ret_from_kernel_thread() do this: call schedule_tail call the callback left for it by copy_thread(); if it ever returns, that's because it has just done successful kernel_execve() jump to return from syscall IOW, its only difference from ret_from_fork() is that it does call the callback. * such an architecture should also get rid of ret_from_kernel_execve() and __ARCH_WANT_KERNEL_EXECVE This is the last part of infrastructure patches in that area - from that point on work on different architectures can live independently. Signed-off-by: Al Viro --- kernel/kmod.c | 3 +++ kernel/kthread.c | 1 + 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index b6e5ca9c758a..1c317e386831 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -221,6 +222,8 @@ static int ____call_usermodehelper(void *data) retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); + if (!retval) + return 0; /* Exec failed? */ fail: diff --git a/kernel/kthread.c b/kernel/kthread.c index b579af57ea10..7ba65c1aa6b3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); -- cgit v1.2.2 From 91a27b2a756784714e924e5e854b919273082d26 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:28 -0400 Subject: vfs: define struct filename and have getname() return it getname() is intended to copy pathname strings from userspace into a kernel buffer. The result is just a string in kernel space. It would however be quite helpful to be able to attach some ancillary info to the string. For instance, we could attach some audit-related info to reduce the amount of audit-related processing needed. When auditing is enabled, we could also call getname() on the string more than once and not need to recopy it from userspace. This patchset converts the getname()/putname() interfaces to return a struct instead of a string. For now, the struct just tracks the string in kernel space and the original userland pointer for it. Later, we'll add other information to the struct as it becomes convenient. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- kernel/auditsc.c | 64 +++++++++++++++++++++++++++++++------------------------- 2 files changed, 37 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 5be01017d30f..08354195eecc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -260,10 +260,10 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - char *tmp = getname(name); + struct filename *tmp = getname(name); if (IS_ERR(tmp)) return (PTR_ERR(tmp)); - error = acct_on(tmp); + error = acct_on(tmp->name); putname(tmp); } else { struct bsd_acct_struct *acct; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d147585e9ef3..d4d82319eed5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -103,28 +103,29 @@ struct audit_cap_data { * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). * - * Further, in fs/namei.c:path_lookup() we store the inode and device. */ + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ struct audit_names { - struct list_head list; /* audit_context->names_list */ - const char *name; - unsigned long ino; - dev_t dev; - umode_t mode; - kuid_t uid; - kgid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - unsigned char type; /* record type */ - bool name_put; /* call __putname() for this name */ + struct list_head list; /* audit_context->names_list */ + struct filename *name; + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + unsigned char type; /* record type */ + bool name_put; /* call __putname() for this name */ /* * This was an allocated audit_names and not from the array of * names allocated in the task audit context. Thus this name * should be freed on syscall exit */ - bool should_free; + bool should_free; }; struct audit_aux_data { @@ -996,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count); list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } dump_stack(); return; @@ -1553,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); + audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the @@ -1563,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, default: /* log the name's directory component */ audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, + audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else @@ -2026,7 +2027,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void __audit_getname(const char *name) +void __audit_getname(struct filename *name) { struct audit_context *context = current->audit_context; struct audit_names *n; @@ -2040,6 +2041,11 @@ void __audit_getname(const char *name) return; } +#if AUDIT_DEBUG + /* The filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; @@ -2059,7 +2065,7 @@ void __audit_getname(const char *name) * then we delay the putname until syscall exit. * Called from include/linux/fs.h:putname(). */ -void audit_putname(const char *name) +void audit_putname(struct filename *name) { struct audit_context *context = current->audit_context; @@ -2074,7 +2080,7 @@ void audit_putname(const char *name) list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } #endif __putname(name); @@ -2088,8 +2094,8 @@ void audit_putname(const char *name) " put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, - context->in_syscall, name, context->name_count, - context->put_count); + context->in_syscall, name->name, + context->name_count, context->put_count); dump_stack(); } } @@ -2152,7 +2158,7 @@ void __audit_inode(const char *name, const struct dentry *dentry, list_for_each_entry_reverse(n, &context->names_list, list) { /* does the name pointer match? */ - if (n->name != name) + if (!n->name || n->name->name != name) continue; /* match the correct record type */ @@ -2175,7 +2181,7 @@ out_alloc: return; out: if (parent) { - n->name_len = n->name ? parent_len(n->name) : AUDIT_NAME_FULL; + n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; } else { n->name_len = AUDIT_NAME_FULL; @@ -2220,7 +2226,7 @@ void __audit_inode_child(const struct inode *parent, continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, n->name_len)) { + !audit_compare_dname_path(dname, n->name->name, n->name_len)) { found_parent = n; break; } @@ -2236,8 +2242,8 @@ void __audit_inode_child(const struct inode *parent, if (found_parent && (n->name != found_parent->name)) continue; - if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, + if (!strcmp(dname, n->name->name) || + !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : AUDIT_NAME_FULL)) { -- cgit v1.2.2 From 7ac86265dc8f665cc49d6e60a125e608cd2fca14 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:28 -0400 Subject: audit: allow audit code to satisfy getname requests from its names_list Currently, if we call getname() on a userland string more than once, we'll get multiple copies of the string and multiple audit_names records. Add a function that will allow the audit_names code to satisfy getname requests using info from the audit_names list, avoiding a new allocation and audit_names records. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d4d82319eed5..521163a5d65f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2020,6 +2020,29 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, return aname; } +/** + * audit_reusename - fill out filename with info from existing entry + * @uptr: userland ptr to pathname + * + * Search the audit_names list for the current audit context. If there is an + * existing entry with a matching "uptr" then return the filename + * associated with that audit_name. If not, return NULL. + */ +struct filename * +__audit_reusename(const __user char *uptr) +{ + struct audit_context *context = current->audit_context; + struct audit_names *n; + + list_for_each_entry(n, &context->names_list, list) { + if (!n->name) + continue; + if (n->name->uptr == uptr) + return n->name; + } + return NULL; +} + /** * audit_getname - add a name to the list * @name: name to add -- cgit v1.2.2 From 669abf4e5539c8aa48bf28c965be05c0a7b58a27 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 16:43:10 -0400 Subject: vfs: make path_openat take a struct filename pointer ...and fix up the callers. For do_file_open_root, just declare a struct filename on the stack and fill out the .name field. For do_filp_open, make it also take a struct filename pointer, and fix up its callers to call it appropriately. For filp_open, add a variant that takes a struct filename pointer and turn filp_open into a wrapper around it. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/acct.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 08354195eecc..051e071a06e7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, } } -static int acct_on(const char *name) +static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; @@ -201,7 +201,7 @@ static int acct_on(const char *name) struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); @@ -263,7 +263,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) struct filename *tmp = getname(name); if (IS_ERR(tmp)) return (PTR_ERR(tmp)); - error = acct_on(tmp->name); + error = acct_on(tmp); putname(tmp); } else { struct bsd_acct_struct *acct; -- cgit v1.2.2 From adb5c2473d3f91526c79db972aafb20a56d3fbb3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 16:43:13 -0400 Subject: audit: make audit_inode take struct filename Keep a pointer to the audit_names "slot" in struct filename. Have all of the audit_inode callers pass a struct filename ponter to audit_inode instead of a string pointer. If the aname field is already populated, then we can skip walking the list altogether and just use it directly. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- kernel/auditsc.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 521163a5d65f..2f186ed80c40 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2076,6 +2076,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; n->name_put = true; + name->aname = n; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); @@ -2166,7 +2167,7 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent * @dentry: dentry being audited * @parent: does this dentry represent the parent? */ -void __audit_inode(const char *name, const struct dentry *dentry, +void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int parent) { struct audit_context *context = current->audit_context; @@ -2179,9 +2180,29 @@ void __audit_inode(const char *name, const struct dentry *dentry, if (!name) goto out_alloc; +#if AUDIT_DEBUG + /* The struct filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + /* + * If we have a pointer to an audit_names entry already, then we can + * just use it directly if the type is correct. + */ + n = name->aname; + if (n) { + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } + } + list_for_each_entry_reverse(n, &context->names_list, list) { /* does the name pointer match? */ - if (!n->name || n->name->name != name) + if (!n->name || n->name->name != name->name) continue; /* match the correct record type */ -- cgit v1.2.2 From 1f5320d5972aa50d3e8d2b227b636b370e608359 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 4 Oct 2012 16:37:16 +0900 Subject: cgroup: notify_on_release may not be triggered in some cases notify_on_release must be triggered when the last process in a cgroup is move to another. But if the first(and only) process in a cgroup is moved to another, notify_on_release is not triggered. # mkdir /cgroup/cpu/SRC # mkdir /cgroup/cpu/DST # # echo 1 >/cgroup/cpu/SRC/notify_on_release # echo 1 >/cgroup/cpu/DST/notify_on_release # # sleep 300 & [1] 8629 # # echo 8629 >/cgroup/cpu/SRC/tasks # echo 8629 >/cgroup/cpu/DST/tasks -> notify_on_release for /SRC must be triggered at this point, but it isn't. This is because put_css_set() is called before setting CGRP_RELEASABLE in cgroup_task_migrate(), and is a regression introduce by the commit:74a1166d(cgroups: make procs file writable), which was merged into v3.0. Cc: Ben Blum Cc: # v3.0.x and later Acked-by: Li Zefan Signed-off-by: Daisuke Nishimura Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13774b3b39aa..d1739fc7eb94 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1962,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * trading it for newcg is protected by cgroup_mutex, we're safe to drop * it here; it will be freed under RCU. */ - put_css_set(oldcg); - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + put_css_set(oldcg); } /** -- cgit v1.2.2 From 85eae82a0855d49852b87deac8653e4ebc8b291f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2012 21:35:59 -0700 Subject: printk: Fix scheduling-while-atomic problem in console_cpu_notify() The console_cpu_notify() function runs with interrupts disabled in the CPU_DYING case. It therefore cannot block, for example, as will happen when it calls console_lock(). Therefore, remove the CPU_DYING leg of the switch statement to avoid this problem. Signed-off-by: Paul E. McKenney Reviewed-by: Srivatsa S. Bhat Signed-off-by: Linus Torvalds --- kernel/printk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 66a2ea37b576..2d607f4d1797 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, switch (action) { case CPU_ONLINE: case CPU_DEAD: - case CPU_DYING: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: console_lock(); -- cgit v1.2.2 From 2702b1526c7278c4d65d78de209a465d4de2885e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Oct 2012 13:56:51 -0700 Subject: kernel/sys.c: fix stack memory content leak via UNAME26 Calling uname() with the UNAME26 personality set allows a leak of kernel stack contents. This fixes it by defensively calculating the length of copy_to_user() call, making the len argument unsigned, and initializing the stack buffer to zero (now technically unneeded, but hey, overkill). CVE-2012-0957 Reported-by: PaX Team Signed-off-by: Kees Cook Cc: Andi Kleen Cc: PaX Team Cc: Brad Spengler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c5cb5b99cb81..01865c6fb6a0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1265,15 +1265,16 @@ DECLARE_RWSEM(uts_sem); * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 */ -static int override_release(char __user *release, int len) +static int override_release(char __user *release, size_t len) { int ret = 0; - char buf[65]; if (current->personality & UNAME26) { - char *rest = UTS_RELEASE; + const char *rest = UTS_RELEASE; + char buf[65] = { 0 }; int ndots = 0; unsigned v; + size_t copy; while (*rest) { if (*rest == '.' && ++ndots >= 3) @@ -1283,8 +1284,9 @@ static int override_release(char __user *release, int len) rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - snprintf(buf, len, "2.6.%u%s", v, rest); - ret = copy_to_user(release, buf, len); + copy = min(sizeof(buf), max_t(size_t, 1, len)); + copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, copy + 1); } return ret; } -- cgit v1.2.2 From bbc2e3ef87851bc5430b2b4cf4ca3a2f29baeda6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 19 Oct 2012 13:56:53 -0700 Subject: pidns: remove recursion from free_pid_ns() free_pid_ns() operates in a recursive fashion: free_pid_ns(parent) put_pid_ns(parent) kref_put(&ns->kref, free_pid_ns); free_pid_ns thus if there was a huge nesting of namespaces the userspace may trigger avalanche calling of free_pid_ns leading to kernel stack exhausting and a panic eventually. This patch turns the recursion into an iterative loop. Based on a patch by Andrew Vagin. [akpm@linux-foundation.org: export put_pid_ns() to modules] Signed-off-by: Cyrill Gorcunov Cc: Andrew Vagin Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 478bad2745e3..eb00be205811 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,19 +133,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old return create_pid_namespace(old_ns); } -void free_pid_ns(struct kref *kref) +static void free_pid_ns(struct kref *kref) { - struct pid_namespace *ns, *parent; + struct pid_namespace *ns; ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; destroy_pid_namespace(ns); +} - if (parent != NULL) - put_pid_ns(parent); +void put_pid_ns(struct pid_namespace *ns) +{ + struct pid_namespace *parent; + + while (ns != &init_pid_ns) { + parent = ns->parent; + if (!kref_put(&ns->kref, free_pid_ns)) + break; + ns = parent; + } } -EXPORT_SYMBOL_GPL(free_pid_ns); +EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { -- cgit v1.2.2 From 9bb71308b8133d643648776243e4d5599b1c193d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Oct 2012 17:52:07 -0700 Subject: Revert "cgroup: Drop task_lock(parent) on cgroup_fork()" This reverts commit 7e381b0eb1e1a9805c37335562e8dc02e7d7848c. The commit incorrectly assumed that fork path always performed threadgroup_change_begin/end() and depended on that for synchronization against task exit and cgroup migration paths instead of explicitly grabbing task_lock(). threadgroup_change is not locked when forking a new process (as opposed to a new thread in the same process) and even if it were it wouldn't be effective as different processes use different threadgroup locks. Revert the incorrect optimization. Signed-off-by: Tejun Heo LKML-Reference: <20121008020000.GB2575@localhost> Acked-by: Li Zefan Bitterly-Acked-by: Frederic Weisbecker Cc: stable@vger.kernel.org --- kernel/cgroup.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1739fc7eb94..75aec12c78a0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4814,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer. cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - /* - * We don't need to task_lock() current because current->cgroups - * can't be changed concurrently here. The parent obviously hasn't - * exited and called cgroup_exit(), and we are synchronized against - * cgroup migration through threadgroup_change_begin(). - */ + task_lock(current); child->cgroups = current->cgroups; get_css_set(child->cgroups); + task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } -- cgit v1.2.2 From d87838321124061f6c935069d97f37010fa417e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Oct 2012 17:40:30 -0700 Subject: Revert "cgroup: Remove task_lock() from cgroup_post_fork()" This reverts commit 7e3aa30ac8c904a706518b725c451bb486daaae9. The commit incorrectly assumed that fork path always performed threadgroup_change_begin/end() and depended on that for synchronization against task exit and cgroup migration paths instead of explicitly grabbing task_lock(). threadgroup_change is not locked when forking a new process (as opposed to a new thread in the same process) and even if it were it wouldn't be effective as different processes use different threadgroup locks. Revert the incorrect optimization. Signed-off-by: Tejun Heo LKML-Reference: <20121008020000.GB2575@localhost> Acked-by: Li Zefan Cc: Frederic Weisbecker Cc: stable@vger.kernel.org --- kernel/cgroup.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75aec12c78a0..f24f724620dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4883,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child) */ if (use_task_css_set_links) { write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) { - /* - * It's safe to use child->cgroups without task_lock() - * here because we are protected through - * threadgroup_change_begin() against concurrent - * css_set change in cgroup_task_migrate(). Also - * the task can't exit at that point until - * wake_up_new_task() is called, so we are protected - * against cgroup_exit() setting child->cgroup to - * init_css_set. - */ + task_lock(child); + if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); - } + task_unlock(child); write_unlock(&css_set_lock); } } -- cgit v1.2.2 From caabe240574aec05b2f5667414ce80f9075c2ba1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 20 Oct 2012 01:19:29 +0100 Subject: MODSIGN: Move the magic string to the end of a module and eliminate the search Emit the magic string that indicates a module has a signature after the signature data instead of before it. This allows module_sig_check() to be made simpler and faster by the elimination of the search for the magic string. Instead we just need to do a single memcmp(). This works because at the end of the signature data there is the fixed-length signature information block. This block then falls immediately prior to the magic number. From the contents of the information block, it is trivial to calculate the size of the signature data and thus the size of the actual module data. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/module-internal.h | 3 +-- kernel/module.c | 26 +++++++++----------------- kernel/module_signing.c | 24 +++++++++++++++--------- 3 files changed, 25 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 6114a13419bd..24f9247b7d02 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -11,5 +11,4 @@ extern struct key *modsign_keyring; -extern int mod_verify_sig(const void *mod, unsigned long modlen, - const void *sig, unsigned long siglen); +extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index 0e2da8695f8e..6085f5ef88ea 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2421,25 +2421,17 @@ static inline void kmemleak_load_module(const struct module *mod, #ifdef CONFIG_MODULE_SIG static int module_sig_check(struct load_info *info, - const void *mod, unsigned long *len) + const void *mod, unsigned long *_len) { int err = -ENOKEY; - const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - const void *p = mod, *end = mod + *len; - - /* Poor man's memmem. */ - while ((p = memchr(p, MODULE_SIG_STRING[0], end - p))) { - if (p + markerlen > end) - break; - - if (memcmp(p, MODULE_SIG_STRING, markerlen) == 0) { - const void *sig = p + markerlen; - /* Truncate module up to signature. */ - *len = p - mod; - err = mod_verify_sig(mod, *len, sig, end - sig); - break; - } - p++; + unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + unsigned long len = *_len; + + if (len > markerlen && + memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + *_len -= markerlen; + err = mod_verify_sig(mod, _len); } if (!err) { diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b09f6983ac0..d492a23df99c 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -183,27 +183,33 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, /* * Verify the signature on a module. */ -int mod_verify_sig(const void *mod, unsigned long modlen, - const void *sig, unsigned long siglen) +int mod_verify_sig(const void *mod, unsigned long *_modlen) { struct public_key_signature *pks; struct module_signature ms; struct key *key; - size_t sig_len; + const void *sig; + size_t modlen = *_modlen, sig_len; int ret; - pr_devel("==>%s(,%lu,,%lu,)\n", __func__, modlen, siglen); + pr_devel("==>%s(,%lu)\n", __func__, modlen); - if (siglen <= sizeof(ms)) + if (modlen <= sizeof(ms)) return -EBADMSG; - memcpy(&ms, sig + (siglen - sizeof(ms)), sizeof(ms)); - siglen -= sizeof(ms); + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + modlen -= sizeof(ms); sig_len = be32_to_cpu(ms.sig_len); - if (sig_len >= siglen || - siglen - sig_len != (size_t)ms.signer_len + ms.key_id_len) + if (sig_len >= modlen) return -EBADMSG; + modlen -= sig_len; + if ((size_t)ms.signer_len + ms.key_id_len >= modlen) + return -EBADMSG; + modlen -= (size_t)ms.signer_len + ms.key_id_len; + + *_modlen = modlen; + sig = mod + modlen; /* For the moment, only support RSA and X.509 identifiers */ if (ms.algo != PKEY_ALGO_RSA || -- cgit v1.2.2 From 31fd84b95eb211d5db460a1dda85e004800a7b52 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Oct 2012 18:45:53 -0700 Subject: use clamp_t in UNAME26 fix The min/max call needed to have explicit types on some architectures (e.g. mn10300). Use clamp_t instead to avoid the warning: kernel/sys.c: In function 'override_release': kernel/sys.c:1287:10: warning: comparison of distinct pointer types lacks a cast [enabled by default] Reported-by: Fengguang Wu Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 01865c6fb6a0..e6e0ece5f6a0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1284,7 +1284,7 @@ static int override_release(char __user *release, size_t len) rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - copy = min(sizeof(buf), max_t(size_t, 1, len)); + copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); } -- cgit v1.2.2 From 0390c8835690506802fd5d54ea5444f0b9b1708b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Oct 2012 18:59:31 -0700 Subject: module_signing: fix printk format warning Fix the warning: kernel/module_signing.c:195:2: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'size_t' by using the proper 'z' modifier for printing a size_t. Signed-off-by: Randy Dunlap Cc: David Howells Signed-off-by: Linus Torvalds --- kernel/module_signing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index d492a23df99c..ea1b1df5dbb0 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -192,7 +192,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) size_t modlen = *_modlen, sig_len; int ret; - pr_devel("==>%s(,%lu)\n", __func__, modlen); + pr_devel("==>%s(,%zu)\n", __func__, modlen); if (modlen <= sizeof(ms)) return -EBADMSG; -- cgit v1.2.2 From c0158ca64da5732dfb86a3f28944e9626776692f Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Thu, 18 Oct 2012 16:31:37 -0700 Subject: workqueue: cancel_delayed_work() should return %false if work item is idle 57b30ae77b ("workqueue: reimplement cancel_delayed_work() using try_to_grab_pending()") made cancel_delayed_work() always return %true unless someone else is also trying to cancel the work item, which is broken - if the target work item is idle, the return value should be %false. try_to_grab_pending() indicates that the target work item was idle by zero return value. Use it for return. Note that this brings cancel_delayed_work() in line with __cancel_work_timer() in return value handling. Signed-off-by: Dan Magenheimer Signed-off-by: Tejun Heo LKML-Reference: <444a6439-b1a4-4740-9e7e-bc37267cfe73@default> --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d951daa0ca9a..042d221d33cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2982,7 +2982,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); local_irq_restore(flags); - return true; + return ret; } EXPORT_SYMBOL(cancel_delayed_work); -- cgit v1.2.2 From f2302505775fd13ba93f034206f1e2a587017929 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Thu, 25 Oct 2012 13:38:07 -0700 Subject: pidns: limit the nesting depth of pid namespaces 'struct pid' is a "variable sized struct" - a header with an array of upids at the end. The size of the array depends on a level (depth) of pid namespaces. Now a level of pidns is not limited, so 'struct pid' can be more than one page. Looks reasonable, that it should be less than a page. MAX_PIS_NS_LEVEL is not calculated from PAGE_SIZE, because in this case it depends on architectures, config options and it will be reduced, if someone adds a new fields in struct pid or struct upid. I suggest to set MAX_PIS_NS_LEVEL = 32, because it saves ability to expand "struct pid" and it's more than enough for all known for me use-cases. When someone finds a reasonable use case, we can add a config option or a sysctl parameter. In addition it will reduce the effect of another problem, when we have many nested namespaces and the oldest one starts dying. zap_pid_ns_processe will be called for each namespace and find_vpid will be called for each process in a namespace. find_vpid will be called minimum max_level^2 / 2 times. The reason of that is that when we found a bit in pidmap, we can't determine this pidns is top for this process or it isn't. vpid is a heavy operation, so a fork bomb, which create many nested namespace, can make a system inaccessible for a long time. For example my system becomes inaccessible for a few minutes with 4000 processes. [akpm@linux-foundation.org: return -EINVAL in response to excessive nesting, not -ENOMEM] Signed-off-by: Andrew Vagin Acked-by: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index eb00be205811..7b07cc0dfb75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -71,12 +71,22 @@ err_alloc: return NULL; } +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; + int i; + int err; + + if (level > MAX_PID_NS_LEVEL) { + err = -EINVAL; + goto out; + } + err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; -- cgit v1.2.2 From 2008713c7174e5c0f207bac684c6df0939047009 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 24 Oct 2012 14:11:48 -0700 Subject: Makefile: Documentation for external tool should be correct If one includes documentation for an external tool, it should be correct. This is not: 1. Overriding the input to rngd should typically be neither necessary nor desired. This is especially so since newer versions of rngd support a number of different *types* of sources. 2. The default kernel-exported device is called /dev/hwrng not /dev/hwrandom nor /dev/hw_random (both of which were used in the past; however, kernel and udev seem to have converged on /dev/hwrng.) Overall it is better if the documentation for rngd is kept with rngd rather than in a kernel Makefile. Signed-off-by: H. Peter Anvin Cc: David Howells Cc: Jeff Garzik Signed-off-by: Linus Torvalds --- kernel/Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0dfeca4324ee..86e3285ae7e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -174,10 +174,8 @@ signing_key.priv signing_key.x509: x509.genkey @echo "###" @echo "### If this takes a long time, you might wish to run rngd in the" @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and should use a hardware random" - @echo "### number generator if one is available, eg:" - @echo "###" - @echo "### rngd -r /dev/hwrandom" + @echo "### needs to be run as root, and uses a hardware random" + @echo "### number generator if one is available." @echo "###" openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ -x509 -config x509.genkey \ -- cgit v1.2.2