author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2014-06-27 16:42:20 -0400
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2014-09-07 19:27:19 -0400
commit     8315f42295d2667a7f942f154b73a86fd7cb2227 (patch)
tree       67057935dada3305e0dab95f546359b40cc29b96
parent     11ed7f934cb807f26da09547b5946c2e534d1dac (diff)
rcu: Add call_rcu_tasks()
This commit adds a new RCU-tasks flavor of RCU, which provides call_rcu_tasks(). This RCU flavor's quiescent states are voluntary context switch (not preemption!) and userspace execution (not the idle loop -- use some sort of schedule_on_each_cpu() if you need to handle the idle tasks). Note that unlike other RCU flavors, these quiescent states occur in tasks, not necessarily CPUs. Includes fixes from Steven Rostedt.

This RCU flavor is assumed to have very infrequent latency-tolerant updaters. This assumption permits significant simplifications, including a single global callback list protected by a single global lock, along with a single task-private linked list containing all tasks that have not yet passed through a quiescent state. If experience shows this assumption to be incorrect, the required additional complexity will be added.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
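The patch itself contains no example caller, so the following is only a hedged sketch of the usage pattern described above; struct my_trampoline, my_trampoline_retire(), and my_trampoline_free() are hypothetical names, and only the call_rcu_tasks() prototype comes from this commit. Once an object has been unpublished so that no task can newly enter code that uses it, the final free is deferred until every task has passed through an RCU-tasks quiescent state:

/* Illustrative sketch only -- not part of this patch. */
struct my_trampoline {
	void *code_page;		/* hypothetical payload */
	struct rcu_head rh;		/* passed to call_rcu_tasks() */
};

static void my_trampoline_free(struct rcu_head *rhp)
{
	struct my_trampoline *tp = container_of(rhp, struct my_trampoline, rh);

	/* Safe: every task has since done a voluntary context switch,
	 * run in usermode, or gone idle, so none can still be in tp. */
	kfree(tp);
}

static void my_trampoline_retire(struct my_trampoline *tp)
{
	/* Caller has already unlinked tp; defer the free past an
	 * RCU-tasks grace period. */
	call_rcu_tasks(&tp->rh, my_trampoline_free);
}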
-rw-r--r--  include/linux/init_task.h |   9
-rw-r--r--  include/linux/rcupdate.h  |  36
-rw-r--r--  include/linux/sched.h     |  23
-rw-r--r--  init/Kconfig              |  10
-rw-r--r--  kernel/rcu/tiny.c         |   2
-rw-r--r--  kernel/rcu/tree.c         |   2
-rw-r--r--  kernel/rcu/update.c       | 171
7 files changed, 242 insertions(+), 11 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2bb4c4f3531a..dffd9258ee60 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -117,6 +117,14 @@ extern struct group_info init_groups;
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
+#ifdef CONFIG_TASKS_RCU
+#define INIT_TASK_RCU_TASKS(tsk)				\
+	.rcu_tasks_holdout = false,				\
+	.rcu_tasks_holdout_list =				\
+		LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list),
+#else
+#define INIT_TASK_RCU_TASKS(tsk)
+#endif
 
 extern struct cred init_cred;
 
@@ -224,6 +232,7 @@ extern struct task_group root_task_group;
 	INIT_FTRACE_GRAPH					\
 	INIT_TRACE_RECURSION					\
 	INIT_TASK_RCU_PREEMPT(tsk)				\
+	INIT_TASK_RCU_TASKS(tsk)				\
 	INIT_CPUSET_SEQ(tsk)					\
 	INIT_RT_MUTEXES(tsk)					\
 	INIT_VTIME(tsk)						\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index d231aa17b1d7..3432063f4c87 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -197,6 +197,26 @@ void call_rcu_sched(struct rcu_head *head,
 
 void synchronize_sched(void);
 
+/**
+ * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution. As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head));
+
 #ifdef CONFIG_PREEMPT_RCU
 
 void __rcu_read_lock(void);
@@ -294,6 +314,22 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
 		rcu_irq_exit(); \
 	} while (0)
 
+/*
+ * Note a voluntary context switch for RCU-tasks benefit.  This is a
+ * macro rather than an inline function to avoid #include hell.
+ */
+#ifdef CONFIG_TASKS_RCU
+#define rcu_note_voluntary_context_switch(t) \
+	do { \
+		preempt_disable(); /* Exclude synchronize_sched(); */ \
+		if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
+			ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+		preempt_enable(); \
+	} while (0)
+#else /* #ifdef CONFIG_TASKS_RCU */
+#define rcu_note_voluntary_context_switch(t)	do { } while (0)
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
 bool __rcu_is_watching(void);
 #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885ee52b..eaacac4ae77d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1270,6 +1270,11 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+	unsigned long rcu_tasks_nvcsw;
+	bool rcu_tasks_holdout;
+	struct list_head rcu_tasks_holdout_list;
+#endif /* #ifdef CONFIG_TASKS_RCU */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
@@ -2000,28 +2005,24 @@ extern void task_clear_jobctl_pending(struct task_struct *task,
 					    unsigned int mask);
 
 #ifdef CONFIG_PREEMPT_RCU
-
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
+#ifdef CONFIG_PREEMPT_RCU
 	p->rcu_read_lock_nesting = 0;
 	p->rcu_read_unlock_special = 0;
-#ifdef CONFIG_TREE_PREEMPT_RCU
 	p->rcu_blocked_node = NULL;
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 	INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+	p->rcu_tasks_holdout = false;
+	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+#endif /* #ifdef CONFIG_TASKS_RCU */
 }
 
-#else
-
-static inline void rcu_copy_process(struct task_struct *p)
-{
-}
-
-#endif
-
 static inline void tsk_restore_flags(struct task_struct *task,
 				     unsigned long orig_flags, unsigned long flags)
 {
diff --git a/init/Kconfig b/init/Kconfig
index e84c6423a2e5..c4539c4e177f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -507,6 +507,16 @@ config PREEMPT_RCU
 	  This option enables preemptible-RCU code that is common between
 	  TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
 
+config TASKS_RCU
+	bool "Task_based RCU implementation using voluntary context switch"
+	default n
+	help
+	  This option enables a task-based RCU implementation that uses
+	  only voluntary context switch (not preemption!), idle, and
+	  user-mode execution as quiescent states.
+
+	  If unsure, say N.
+
 config RCU_STALL_COMMON
 	def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
 	help
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..717f00854fc0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -254,6 +254,8 @@ void rcu_check_callbacks(int cpu, int user)
 		rcu_sched_qs(cpu);
 	else if (!in_softirq())
 		rcu_bh_qs(cpu);
+	if (user)
+		rcu_note_voluntary_context_switch(current);
 }
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..8ad91d1e317d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user)
 	rcu_preempt_check_callbacks(cpu);
 	if (rcu_pending(cpu))
 		invoke_rcu_core();
+	if (user)
+		rcu_note_voluntary_context_switch(current);
 	trace_rcu_utilization(TPS("End scheduler-tick"));
 }
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4056d7992a6c..19b3dacb0753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,7 @@
 #include <linux/hardirq.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 
 #define CREATE_TRACE_POINTS
 
@@ -347,3 +348,173 @@ static int __init check_cpu_stall_init(void)
 early_initcall(check_cpu_stall_init);
 
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle.  As such, grace periods can take one good
+ * long time.  There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like.  Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs.  If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/* Global list of callbacks and associated lock. */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/* Post an RCU-tasks callback. */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+	unsigned long flags;
+
+	rhp->next = NULL;
+	rhp->func = func;
+	raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+	*rcu_tasks_cbs_tail = rhp;
+	rcu_tasks_cbs_tail = &rhp->next;
+	raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/* See if the current task has stopped holding out, remove from list if so. */
+static void check_holdout_task(struct task_struct *t)
+{
+	if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
+	    t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
+	    !ACCESS_ONCE(t->on_rq)) {
+		ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+		list_del_rcu(&t->rcu_tasks_holdout_list);
+		put_task_struct(t);
+	}
+}
+
+/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+	struct rcu_head *list;
+	struct rcu_head *next;
+	LIST_HEAD(rcu_tasks_holdouts);
+
+	/* FIXME: Add housekeeping affinity. */
+
+	/*
+	 * Each pass through the following loop makes one check for
+	 * newly arrived callbacks, and, if there are some, waits for
+	 * one RCU-tasks grace period and then invokes the callbacks.
+	 * This loop is terminated by the system going down.  ;-)
+	 */
+	for (;;) {
+
+		/* Pick up any new callbacks. */
+		raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+		list = rcu_tasks_cbs_head;
+		rcu_tasks_cbs_head = NULL;
+		rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+		raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+		/* If there were none, wait a bit and start over. */
+		if (!list) {
+			schedule_timeout_interruptible(HZ);
+			WARN_ON(signal_pending(current));
+			continue;
+		}
+
+		/*
+		 * Wait for all pre-existing t->on_rq and t->nvcsw
+		 * transitions to complete.  Invoking synchronize_sched()
+		 * suffices because all these transitions occur with
+		 * interrupts disabled.  Without this synchronize_sched(),
+		 * a read-side critical section that started before the
+		 * grace period might be incorrectly seen as having started
+		 * after the grace period.
+		 *
+		 * This synchronize_sched() also dispenses with the
+		 * need for a memory barrier on the first store to
+		 * ->rcu_tasks_holdout, as it forces the store to happen
+		 * after the beginning of the grace period.
+		 */
+		synchronize_sched();
+
+		/*
+		 * There were callbacks, so we need to wait for an
+		 * RCU-tasks grace period.  Start off by scanning
+		 * the task list for tasks that are not already
+		 * voluntarily blocked.  Mark these tasks and make
+		 * a list of them in rcu_tasks_holdouts.
+		 */
+		rcu_read_lock();
+		for_each_process_thread(g, t) {
+			if (t != current && ACCESS_ONCE(t->on_rq) &&
+			    !is_idle_task(t)) {
+				get_task_struct(t);
+				t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+				ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+				list_add(&t->rcu_tasks_holdout_list,
+					 &rcu_tasks_holdouts);
+			}
+		}
+		rcu_read_unlock();
+
+		/*
+		 * Each pass through the following loop scans the list
+		 * of holdout tasks, removing any that are no longer
+		 * holdouts.  When the list is empty, we are done.
+		 */
+		while (!list_empty(&rcu_tasks_holdouts)) {
+			schedule_timeout_interruptible(HZ);
+			WARN_ON(signal_pending(current));
+			rcu_read_lock();
+			list_for_each_entry_rcu(t, &rcu_tasks_holdouts,
+						rcu_tasks_holdout_list)
+				check_holdout_task(t);
+			rcu_read_unlock();
+		}
+
+		/*
+		 * Because ->on_rq and ->nvcsw are not guaranteed
+		 * to have full memory barriers prior to them in the
+		 * schedule() path, memory reordering on other CPUs could
+		 * cause their RCU-tasks read-side critical sections to
+		 * extend past the end of the grace period.  However,
+		 * because these ->nvcsw updates are carried out with
+		 * interrupts disabled, we can use synchronize_sched()
+		 * to force the needed ordering on all such CPUs.
+		 *
+		 * This synchronize_sched() also confines all
+		 * ->rcu_tasks_holdout accesses to be within the grace
+		 * period, avoiding the need for memory barriers for
+		 * ->rcu_tasks_holdout accesses.
+		 */
+		synchronize_sched();
+
+		/* Invoke the callbacks. */
+		while (list) {
+			next = list->next;
+			local_bh_disable();
+			list->func(list);
+			local_bh_enable();
+			list = next;
+			cond_resched();
+		}
+	}
+}
+
+/* Spawn rcu_tasks_kthread() at boot time. */
+static int __init rcu_spawn_tasks_kthread(void)
+{
+	struct task_struct __maybe_unused *t;
+
+	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+	BUG_ON(IS_ERR(t));
+	return 0;
+}
+early_initcall(rcu_spawn_tasks_kthread);
+
+#endif /* #ifdef CONFIG_TASKS_RCU */
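
This commit provides only the asynchronous interface; no synchronous wait is added here. If a blocking variant were wanted, it could be layered on call_rcu_tasks() with a completion, along the lines of the hedged sketch below (rcu_tasks_waiter, rcu_tasks_wakeme(), and synchronize_rcu_tasks_sketch() are illustrative names, not part of this patch):

/* Illustrative sketch only -- not part of this patch. */
struct rcu_tasks_waiter {
	struct rcu_head rh;
	struct completion done;
};

static void rcu_tasks_wakeme(struct rcu_head *rhp)
{
	struct rcu_tasks_waiter *w = container_of(rhp, struct rcu_tasks_waiter, rh);

	complete(&w->done);
}

static void synchronize_rcu_tasks_sketch(void)
{
	struct rcu_tasks_waiter w;

	init_completion(&w.done);
	call_rcu_tasks(&w.rh, rcu_tasks_wakeme);
	wait_for_completion(&w.done);	/* returns after a full RCU-tasks grace period */
}

Given the single global callback list and the HZ-granularity polling in rcu_tasks_kthread(), such a wait can easily take many seconds, which is consistent with the commit's assumption of infrequent, latency-tolerant updaters.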