From fc2219d49ef1606e7fd2c88af2b423b01ff3d319 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 09:50:41 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett These issues identified during an old-fashioned face-to-face code review extended over many hours. o Bury various forms of the "rsp->completed == rsp->gpnum" comparison into an rcu_gp_in_progress() function, which has the beneficial side-effect of forcing consistent use of ACCESS_ONCE(). o Replace hand-coded arithmetic with DIV_ROUND_UP(). o Bury several "!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x01])" instances into an rcu_preempted_readers() function, as this expression indicates that there are no readers blocked within RCU read-side critical sections blocking the current grace period. (Though there might well be similar readers blocking the next grace period.) o Remove a dangling rcu_restart_cpu() declaration that has been dangling for almost 20 minor releases of the kernel. Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12537246442687-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 29 +++++++++++++++++----------- kernel/rcutree.h | 6 +++--- kernel/rcutree_plugin.h | 50 ++++++++++++++++++++++++------------------------- 3 files changed, 46 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 52b06f6e158c..f85b6842d1e1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -100,6 +100,16 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, #include "rcutree_plugin.h" +/* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} + /* * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least @@ -173,9 +183,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - /* ACCESS_ONCE() because we are accessing outside of lock. */ - return *rdp->nxttail[RCU_DONE_TAIL] && - ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); + return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); } /* @@ -482,7 +490,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -537,8 +545,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rsp->gpnum != rsp->completed && - delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { /* They had two time units to dump stack, so complain. */ print_other_cpu_stall(rsp); @@ -703,9 +710,9 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) * hold rnp->lock, as required by rcu_start_gp(), which will release it. */ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) - __releases(rnp->lock) + __releases(rcu_get_root(rsp)->lock) { - WARN_ON_ONCE(rsp->completed == rsp->gpnum); + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -1092,7 +1099,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ @@ -1251,7 +1258,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Start a new grace period if one not already started. */ - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + if (!rcu_gp_in_progress(rsp)) { unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -1331,7 +1338,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + if (rcu_gp_in_progress(rsp) && ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { rdp->n_rp_need_fqs++; return 1; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8e8287a983c2..9aa8c8a160d8 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -48,14 +48,14 @@ #elif NR_CPUS <= RCU_FANOUT_SQ # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 #elif NR_CPUS <= RCU_FANOUT_CUBE # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_3 NR_CPUS #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 1cee04f627eb..8ff1ba7b3c43 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -150,6 +150,16 @@ void __rcu_read_lock(void) } EXPORT_SYMBOL_GPL(__rcu_read_lock); +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); +} + static void rcu_read_unlock_special(struct task_struct *t) { int empty; @@ -196,7 +206,7 @@ static void rcu_read_unlock_special(struct task_struct *t) break; spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + empty = !rcu_preempted_readers(rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; @@ -207,7 +217,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * drop rnp->lock and restore irq. */ if (!empty && rnp->qsmask == 0 && - list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { + !rcu_preempted_readers(rnp)) { struct rcu_node *rnp_p; if (rnp->parent == NULL) { @@ -257,12 +267,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { unsigned long flags; struct list_head *lp; - int phase = rnp->gpnum & 0x1; + int phase; struct task_struct *t; - if (!list_empty(&rnp->blocked_tasks[phase])) { + if (rcu_preempted_readers(rnp)) { spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; /* re-read under lock. */ + phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); @@ -281,20 +291,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rcu_preempted_readers(rnp)); WARN_ON_ONCE(rnp->qsmask); } -/* - * Check for preempted RCU readers for the specified rcu_node structure. - * If the caller needs a reliable answer, it must hold the rcu_node's - * >lock. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); -} - #ifdef CONFIG_HOTPLUG_CPU /* @@ -461,6 +461,15 @@ static void rcu_preempt_note_context_switch(int cpu) { } +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return 0; +} + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR /* @@ -483,15 +492,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -/* - * Because preemptable RCU does not exist, there are never any preempted - * RCU readers. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU /* -- cgit v1.2.2 From 1eba8f84380bede3c602bd7758dea96925cead01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 09:50:42 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett, part 2 These issues identified during an old-fashioned face-to-face code review extending over many hours. o Add comments for tricky parts of code, and correct comments that have passed their sell-by date. o Get rid of the vestiges of rcu_init_sched(), which is no longer needed now that PREEMPT_RCU is gone. o Move the #include of rcutree_plugin.h to the end of rcutree.c, which means that, rather than having a random collection of forward declarations, the new set of forward declarations document the set of plugins. The new home for this #include also allows __rcu_init_preempt() to move into rcutree_plugin.h. o Fix rcu_preempt_check_callbacks() to be static. Suggested-by: Josh Triplett Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12537246443924-git-send-email-> Signed-off-by: Ingo Molnar Peter Zijlstra --- kernel/rcutree.c | 72 +++++++++++++++++++++++-------------------------- kernel/rcutree.h | 31 +++++++++++++++------ kernel/rcutree_plugin.h | 23 ++++++++++++++-- 3 files changed, 77 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f85b6842d1e1..53a5ef0ca911 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -81,24 +81,29 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -extern long rcu_batches_completed_sched(void); -static struct rcu_node *rcu_get_root(struct rcu_state *rsp); -static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags); -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); +/* Forward declarations for rcutree_plugin.h */ +static inline void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void __rcu_process_callbacks(struct rcu_state *rsp, - struct rcu_data *rdp); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp); -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, - int preemptable); +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void __init __rcu_init_preempt(void); -#include "rcutree_plugin.h" /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -377,7 +382,7 @@ static long dyntick_recall_completed(struct rcu_state *rsp) /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU - * is already in a quiescent state courtesy of dynticks idle mode. + * is in dynticks idle mode, which is an extended quiescent state. */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { @@ -624,9 +629,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) note_new_gpnum(rsp, rdp); /* - * Because we are first, we know that all our callbacks will - * be covered by this upcoming grace period, even the ones - * that were registered arbitrarily recently. + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. */ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; @@ -886,7 +897,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) /* * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiscent, so it is now + * Note that the outgoing CPU is now quiescent, so it is now * (uncharacteristically) safe to access its rcu_data structure. * Note also that we must carefully retain the order of the * outgoing CPU's callbacks in order for rcu_barrier() to work @@ -1577,25 +1588,6 @@ do { \ } \ } while (0) -#ifdef CONFIG_TREE_PREEMPT_RCU - -void __init __rcu_init_preempt(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init_preempt(void) -{ -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - void __init __rcu_init(void) { int i; /* All used by RCU_INIT_FLAVOR(). */ @@ -1612,6 +1604,8 @@ void __init __rcu_init(void) open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } +#include "rcutree_plugin.h" + module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9aa8c8a160d8..a48d11f37b4c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -79,15 +79,21 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; + spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ unsigned long qsmaskinit; /* Per-GP initialization for qsmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ @@ -95,6 +101,9 @@ struct rcu_node { struct rcu_node *parent; struct list_head blocked_tasks[2]; /* Tasks blocked in RCU read-side critsect. */ + /* Grace period number (->gpnum) x blocked */ + /* by tasks on the (x & 0x1) element of the */ + /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; /* Index values for nxttail array in struct rcu_data. */ @@ -126,19 +135,22 @@ struct rcu_data { * Any of the partitions might be empty, in which case the * pointer to that partition will be equal to the pointer for * the following partition. When the list is empty, all of - * the nxttail elements point to nxtlist, which is NULL. + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. * - * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP * [nxtlist, *nxttail[RCU_DONE_TAIL]): * Entries that batch # <= ->completed * The grace period for these entries has completed, and * the other grace-period-completed entries may be moved * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; @@ -216,6 +228,9 @@ struct rcu_state { /* Force QS state. */ long gpnum; /* Current gp number. */ long completed; /* # of last completed gp. */ + + /* End of fields guarded by root rcu_node's lock. */ + spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. */ spinlock_t fqslock; /* Only one task forcing */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8ff1ba7b3c43..65250219ab6d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -418,6 +418,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); } +/* + * Initialize preemptable RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + int i; /* All used by RCU_INIT_FLAVOR(). */ + int j; + struct rcu_node *rnp; + + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + /* * Check for a task exiting while in a preemptable-RCU read-side * critical section, clean up if so. No need to issue warnings, @@ -518,7 +530,7 @@ static void rcu_preempt_offline_cpu(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to check. */ -void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(int cpu) { } @@ -526,7 +538,7 @@ void rcu_preempt_check_callbacks(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to process. */ -void rcu_preempt_process_callbacks(void) +static void rcu_preempt_process_callbacks(void) { } @@ -563,4 +575,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { } +/* + * Because preemptable RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ -- cgit v1.2.2 From 9b2619aff0332e95ea5eb7a0d75b0208818d871c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 09:50:43 -0700 Subject: rcu: Clean up code to address Ingo's checkpatch feedback Move declarations and update storage classes to make checkpatch happy. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12537246441701-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 3 --- kernel/rcutorture.c | 4 +--- kernel/rcutree.c | 23 ----------------------- kernel/rcutree.h | 26 +++++++++++++++++++++++++- kernel/rcutree_trace.c | 12 ++++++------ 5 files changed, 32 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 37ac45483082..8e795133b33d 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -259,9 +259,6 @@ static void rcu_migrate_callback(struct rcu_head *notused) wake_up(&rcu_migrate_wq); } -extern int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); - static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 233768f21f97..697c0a0229d4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -606,8 +606,6 @@ static struct rcu_torture_ops sched_ops_sync = { .name = "sched_sync" }; -extern int rcu_expedited_torture_stats(char *page); - static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, @@ -650,7 +648,7 @@ rcu_torture_writer(void *arg) old_rp = rcu_torture_current; rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53a5ef0ca911..8e52cde7b8f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -81,29 +81,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -/* Forward declarations for rcutree_plugin.h */ -static inline void rcu_bootup_announce(void); -long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); -static int rcu_preempted_readers(struct rcu_node *rnp); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -static void rcu_print_task_stall(struct rcu_node *rnp); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -static void rcu_preempt_offline_cpu(int cpu); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_needs_cpu(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void __init __rcu_init_preempt(void); - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a48d11f37b4c..e6ab31cc28ba 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -270,5 +270,29 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#endif /* #ifdef RCU_TREE_NONCORE */ +#else /* #ifdef RCU_TREE_NONCORE */ +/* Forward declarations for rcutree_plugin.h */ +static inline void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void __init __rcu_init_preempt(void); + +#endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c89f5e9fd173..f09af28b8262 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata, NULL); } -static struct file_operations rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, @@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata_csv, NULL); } -static struct file_operations rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, .open = rcudata_csv_open, .read = seq_read, @@ -159,7 +159,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", rsp->completed, rsp->gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), @@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file) return single_open(file, show_rcuhier, NULL); } -static struct file_operations rcuhier_fops = { +static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, @@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file) return single_open(file, show_rcugp, NULL); } -static struct file_operations rcugp_fops = { +static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, @@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file) return single_open(file, show_rcu_pending, NULL); } -static struct file_operations rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, -- cgit v1.2.2 From d3f6302e7e51b41af86c6496ffb2f95e8f2179df Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 24 Sep 2009 14:07:55 -0700 Subject: hrtimer: Remove overly verbose "switch to high res mode" message On big systems, printing copies of Switched to high resolution mode on CPU nnn clutters up the kernel log for minimal gain. Just get rid of them. Signed-off-by: Roland Dreier LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e5d98ce50f89..42672790ee13 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -720,8 +720,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } -- cgit v1.2.2 From 8c9ed8e14c342ec5e7f27e7e498f62409a10eb29 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 25 Sep 2009 13:51:17 +0800 Subject: perf_event: Fix event group handling in __perf_event_sched_*() Paul Mackerras says: "Actually, looking at this more closely, it has to be a group leader anyway since it's at the top level of ctx->group_list. In fact I see four places where we do: list_for_each_entry(event, &ctx->group_list, group_entry) { if (event == event->group_leader) ... or the equivalent, three of which appear to have been introduced by afedadf2 ("perf_counter: Optimize sched in/out of counters") back in May by Peter Z. As far as I can see the if () is superfluous in each case (a singleton event will be a group of 1 and will have its group_leader pointing to itself)." [ See: http://marc.info/?l=linux-kernel&m=125361238901442&w=2 ] And Peter Zijlstra points out this is a bugfix: "The intent was to call event_sched_{in,out}() for single event groups because that's cheaper than group_sched_{in,out}(), however.. - as you noticed, I got the condition wrong, it should have read: list_empty(&event->sibling_list) - it failed to call group_can_go_on() which deals with ->exclusive. - it also doesn't call hw_perf_group_sched_in() which might break power." [ See: http://marc.info/?l=linux-kernel&m=125369523318583&w=2 ] Changelog v1->v2: - Fix the title name according to Peter Zijlstra's suggestion - Remove the comments and WARN_ON_ONCE() as Peter Zijlstra's suggestion Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4ABC5A55.7000208@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 0f86feb6db0c..e50543db642a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1030,14 +1030,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event != event->group_leader) - event_sched_out(event, cpuctx, ctx); - else - group_sched_out(event, cpuctx, ctx); - } - } + if (ctx->nr_active) + list_for_each_entry(event, &ctx->group_list, group_entry) + group_sched_out(event, cpuctx, ctx); + perf_enable(); out: spin_unlock(&ctx->lock); @@ -1258,12 +1254,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) - event_sched_in(event, cpuctx, ctx, cpu); - else { - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); - } + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); /* * If this pinned group hasn't been scheduled, @@ -1291,15 +1283,9 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) { - if (event_sched_in(event, cpuctx, ctx, cpu)) + if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; - } else { - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } } perf_enable(); out: -- cgit v1.2.2 From 27f9994c50e95f3a5a81fe4c7491a9f9cffe6ec0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 25 Sep 2009 13:54:01 +0800 Subject: perf_event: Clean up perf_event_init_task() While at it: we can traverse ctx->group_list to get all group leader, it should be safe since we hold ctx->mutex. Changlog v1->v2: - remove WARN_ON_ONCE() according to Peter Zijlstra's suggestion Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4ABC5AF9.6060808@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e50543db642a..e491fb087939 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4767,9 +4767,7 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { - if (event != event->group_leader) - continue; + list_for_each_entry(event, &parent_ctx->group_list, group_entry) { if (!event->attr.inherit) { inherited_all = 0; -- cgit v1.2.2 From f9ac5a69edee0ee7e06a05727226e3f275306c8d Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 28 Sep 2009 16:55:40 +0900 Subject: kmemtrace: Fix up tracer registration Commit ddc1637af217dbd8bc51f30e6d24e84476a869a6 ("kmemtrace: Print binary output only if 'bin' option is set") ended up inverting the error detection logic. register_tracer() returns 0 on success, which this change caused to treat as an error, resulting in: [ 0.132000] Warning: could not register the kmem tracer as well as bailing out of the initcall with an error value. This restores the old logic. Signed-off-by: Paul Mundt Acked-by: Pekka Enberg Acked-by: Frederic Weisbecker Cc: Eduard - Gabriel Munteanu Cc: Steven Rostedt Cc: Li Zefan LKML-Reference: <20090928075540.GD6668@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 81b1645c8549..a91da69f153a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void) return 1; } - if (!register_tracer(&kmem_tracer)) { + if (register_tracer(&kmem_tracer) != 0) { pr_warning("Warning: could not register the kmem tracer\n"); return 1; } -- cgit v1.2.2 From 48c0d4d4c04dd520c55e0fd756fa4e7c83de3d13 Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Fri, 25 Sep 2009 06:19:26 +0200 Subject: Add missing blk_trace_remove_sysfs to be in pair with blk_trace_init_sysfs Add missing blk_trace_remove_sysfs to be in pair with blk_trace_init_sysfs introduced in commit 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82. Release kobject also in case the request_fn is NULL. Problem was noticed via kmemleak backtrace when some sysfs entries were note properly destroyed during device removal: unreferenced object 0xffff88001aa76640 (size 80): comm "lvcreate", pid 2120, jiffies 4294885144 hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 f0 65 a7 1a 00 88 ff ff .........e...... 90 66 a7 1a 00 88 ff ff 86 1d 53 81 ff ff ff ff .f........S..... backtrace: [] kmemleak_alloc+0x26/0x60 [] kmem_cache_alloc+0x133/0x1c0 [] sysfs_new_dirent+0x41/0x120 [] sysfs_add_file_mode+0x3c/0xb0 [] internal_create_group+0xc1/0x1a0 [] sysfs_create_group+0x13/0x20 [] blk_trace_init_sysfs+0x14/0x20 [] blk_register_queue+0x3c/0xf0 [] add_disk+0x94/0x160 [] dm_create+0x598/0x6e0 [dm_mod] [] dev_create+0x51/0x350 [dm_mod] [] ctl_ioctl+0x1a3/0x240 [dm_mod] [] dm_compat_ctl_ioctl+0x12/0x20 [dm_mod] [] compat_sys_ioctl+0xcd/0x4f0 [] sysenter_dispatch+0x7/0x2c [] 0xffffffffffffffff Signed-off-by: Zdenek Kabelac Reviewed-by: Li Zefan Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3eb159c277c8..60b5c5a3d4b4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1657,6 +1657,11 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING -- cgit v1.2.2 From b0da3f0dada78832c9da03ad2152ae76bd9a2496 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Thu, 1 Oct 2009 21:16:13 +0200 Subject: Add a tracepoint for block request remapping Since 2.6.31 now has request-based device-mapper, it's useful to have a tracepoint for request-remapping as well as bio-remapping. This patch adds a tracepoint for request-remapping, trace_block_rq_remap(). Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Cc: Li Zefan Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 60b5c5a3d4b4..d9d6206e0b14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -855,6 +855,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, sizeof(r), &r); } +/** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + /** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); -- cgit v1.2.2 From f83f9ac2632732bd1678150b5a03d152f912fe72 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Thu, 24 Sep 2009 06:47:10 +0000 Subject: sched: Set correct normal_prio and prio values in sched_fork() normal_prio should be updated if policy changes from RT to SCHED_MORMAL or if static_prio/nice is changed. Some paths through sched_fork() ignore this requirement and may result in normal_prio having an invalid value. Fixing this issue allows the call to effective_prio() in wake_up_new_task() to be removed. Signed-off-by: Peter Williams Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1535f3884b88..76c0e9691fc0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2514,23 +2514,18 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; - - if (p->normal_prio < DEFAULT_PRIO) - p->prio = DEFAULT_PRIO; + p->normal_prio = p->static_prio; + } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; set_load_weight(p); } @@ -2541,6 +2536,11 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; @@ -2581,8 +2581,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - p->prio = effective_prio(p); - if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { -- cgit v1.2.2 From 162cc2794df37662beb7f97ddd1dd5bffaf85e9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 16:18:13 -0700 Subject: rcu: Fix rcu_lock_map build failure on CONFIG_PROVE_LOCKING=y Move the rcu_lock_map definition from rcutree.c to rcupdate.c so that TINY_RCU can use lockdep. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 7 +++++++ kernel/rcutree.c | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 8e795133b33d..4a189ea18b48 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,6 +46,13 @@ #include #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif + enum rcu_barrier { RCU_BARRIER_STD, RCU_BARRIER_BH, diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8e52cde7b8f7..81af59b8dd88 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -49,13 +49,6 @@ #include "rcutree.h" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - /* Data structures. */ #define RCU_STATE_INITIALIZER(name) { \ -- cgit v1.2.2 From 3d76c082907e8f83c5d5c4572f38d53ad8f00c4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2009 07:46:32 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett, part 3 Whitespace fixes, updated comments, and trivial code movement. o Fix whitespace error in RCU_HEAD_INIT() o Move "So where is rcu_write_lock()" comment so that it does not come between the rcu_read_unlock() header comment and the rcu_read_unlock() definition. o Move the module_param statements for blimit, qhimark, and qlowmark to immediately follow the corresponding definitions. o In __rcu_offline_cpu(), move the assignment to rdp_me inside the "if" statement, given that rdp_me is not used outside of that "if" statement. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12541491931164-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 81af59b8dd88..d5597830faf5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -122,6 +122,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -878,8 +882,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) * indefinitely delay callbacks, you have far worse things to * be worrying about. */ - rdp_me = rsp->rda[smp_processor_id()]; if (rdp->nxtlist != NULL) { + rdp_me = rsp->rda[smp_processor_id()]; *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtlist = NULL; @@ -1575,7 +1579,3 @@ void __init __rcu_init(void) } #include "rcutree_plugin.h" - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -- cgit v1.2.2 From a0b6c9a78c41dc36732d6e1e90f0f2f57b29816f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2009 07:46:33 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett, part 4 These issues identified during an old-fashioned face-to-face code review extending over many hours. This group improves an existing abstraction and introduces two new ones. It also fixes an RCU stall-warning bug found while making the other changes. o Make RCU_INIT_FLAVOR() declare its own variables, removing the need to declare them at each call site. o Create an rcu_for_each_leaf() macro that scans the leaf nodes of the rcu_node tree. o Create an rcu_for_each_node_breadth_first() macro that does a breadth-first traversal of the rcu_node tree, AKA stepping through the array in index-number order. o If all CPUs corresponding to a given leaf rcu_node structure go offline, then any tasks queued on that leaf will be moved to the root rcu_node structure. Therefore, the stall-warning code must dump out tasks queued on the root rcu_node structure as well as those queued on the leaf rcu_node structures. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12541491934126-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 53 ++++++++++++++++++++++++++----------------------- kernel/rcutree.h | 12 +++++++++++ kernel/rcutree_plugin.h | 4 ---- 3 files changed, 40 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5597830faf5..e2e272b5c277 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -462,8 +462,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) long delta; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; /* Only let one CPU complain about others per time interval. */ @@ -474,18 +472,24 @@ static void print_other_cpu_stall(struct rcu_state *rsp) return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { rcu_print_task_stall(rnp); - if (rnp_cur->qsmask == 0) + if (rnp->qsmask == 0) continue; - for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) - if (rnp_cur->qsmask & (1UL << cpu)) - printk(" %d", rnp_cur->grplo + cpu); + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); @@ -649,7 +653,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * one corresponding to this CPU, due to the fact that we have * irqs disabled. */ - for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { + rcu_for_each_node_breadth_first(rsp, rnp) { spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1042,33 +1046,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int cpu; unsigned long flags; unsigned long mask; - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + struct rcu_node *rnp; - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp_cur->lock, flags); + spin_lock_irqsave(&rnp->lock, flags); if (rsp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); return 1; } - if (rnp_cur->qsmask == 0) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + if (rnp->qsmask == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); continue; } - cpu = rnp_cur->grplo; + cpu = rnp->grplo; bit = 1; - for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { - if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } if (mask != 0 && rsp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp_cur->lock. */ - cpu_quiet_msk(mask, rsp, rnp_cur, flags); + /* cpu_quiet_msk() releases rnp->lock. */ + cpu_quiet_msk(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1550,6 +1553,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + int i; \ + int j; \ + struct rcu_node *rnp; \ + \ rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ @@ -1564,10 +1571,6 @@ do { \ void __init __rcu_init(void) { - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e6ab31cc28ba..676eecd371d9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -106,6 +106,18 @@ struct rcu_node { /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 65250219ab6d..57200fe96d0a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -423,10 +423,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) */ static void __init __rcu_init_preempt(void) { - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); } -- cgit v1.2.2 From 135c8aea557cf53abe6c8847e286d01442124193 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2009 21:50:21 -0700 Subject: rcu: Replace the rcu_barrier enum with pointer to call_rcu*() function The rcu_barrier enum causes several problems: (1) you have to define the enum somewhere, and there is no convenient place, (2) the difference between TREE_RCU and TREE_PREEMPT_RCU causes problems when you need to map from rcu_barrier enum to struct rcu_state, (3) the switch statement are large, and (4) TINY_RCU really needs a different rcu_barrier() than do the treercu implementations. So replace it with a functionally equivalent but cleaner function pointer abstraction. Signed-off-by: Paul E. McKenney Acked-by: Mathieu Desnoyers Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12541998232366-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4a189ea18b48..e43242274466 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,12 +53,6 @@ struct lockdep_map rcu_lock_map = EXPORT_SYMBOL_GPL(rcu_lock_map); #endif -enum rcu_barrier { - RCU_BARRIER_STD, - RCU_BARRIER_BH, - RCU_BARRIER_SCHED, -}; - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -184,19 +178,12 @@ static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); atomic_inc(&rcu_barrier_cpu_count); - switch ((enum rcu_barrier)type) { - case RCU_BARRIER_STD: - call_rcu(head, rcu_barrier_callback); - break; - case RCU_BARRIER_BH: - call_rcu_bh(head, rcu_barrier_callback); - break; - case RCU_BARRIER_SCHED: - call_rcu_sched(head, rcu_barrier_callback); - break; - } + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); } static inline void wait_migrated_callbacks(void) @@ -209,7 +196,8 @@ static inline void wait_migrated_callbacks(void) * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(enum rcu_barrier type) +static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -225,7 +213,7 @@ static void _rcu_barrier(enum rcu_barrier type) * early. */ atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)type, 1); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); @@ -238,7 +226,7 @@ static void _rcu_barrier(enum rcu_barrier type) */ void rcu_barrier(void) { - _rcu_barrier(RCU_BARRIER_STD); + _rcu_barrier(call_rcu); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -247,7 +235,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ void rcu_barrier_bh(void) { - _rcu_barrier(RCU_BARRIER_BH); + _rcu_barrier(call_rcu_bh); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -256,7 +244,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(RCU_BARRIER_SCHED); + _rcu_barrier(call_rcu_sched); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -- cgit v1.2.2 From d014e8894dfc523dd9d2f2a17b6dcb94facea810 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 2 Oct 2009 14:41:20 +0300 Subject: panic: Fix panic message visibility by calling bust_spinlocks(0) before dying Commit ffd71da4e3f ("panic: decrease oops_in_progress only after having done the panic") moved bust_spinlocks(0) to the end of the function, which in practice is never reached. As a result console_unblank() is not called, and on some systems the user may not see the panic message. Move it back up to before the unblanking. Signed-off-by: Aaro Koskinen Reviewed-by: Frederic Weisbecker LKML-Reference: <1254483680-25578-1-git-send-email-aaro.koskinen@nokia.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73b0ca3..bc4dcb6a389b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + bust_spinlocks(0); + if (!panic_blink) panic_blink = no_blink; @@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - bust_spinlocks(0); } EXPORT_SYMBOL(panic); -- cgit v1.2.2 From eaaea8036d0261d87d7072c5bc88c7ea730c18ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 4 Oct 2009 09:34:17 +0200 Subject: futex: Fix locking imbalance Rich reported a lock imbalance in the futex code: http://bugzilla.kernel.org/show_bug.cgi?id=14288 It's caused by the displacement of the retry_private label in futex_wake_op(). The code unlocks the hash bucket locks in the error handling path and retries without locking them again which makes the next unlock fail. Move retry_private so we lock the hash bucket locks when we retry. Reported-by: Rich Ercolany Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: stable-2.6.31 LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 463af2efa512..1e176f3ab26c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,8 +916,8 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - double_lock_hb(hb1, hb2); retry_private: + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { -- cgit v1.2.2 From ee949a86b3aef15845ea677aa60231008de62672 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 6 Oct 2009 01:00:49 -0500 Subject: tracing/syscalls: Use long for syscall ret format and field definitions The syscall event definitions use long for the syscall exit ret value, but unsigned long for the same thing in the format and field definitions. Change them all to long. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: rostedt@goodmis.org Cc: lizf@cn.fujitsu.com Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1254808849-7829-4-git-send-email-tzanussi@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9fbce6c9d2e1..527e17eae575 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -166,7 +166,7 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(unsigned long, ret)); + SYSCALL_FIELD(long, ret)); if (!ret) return 0; @@ -212,7 +212,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, FILTER_OTHER); return ret; -- cgit v1.2.2 From 906010b2134e14a2e377decbadd357b3d0ab9c6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Sep 2009 16:08:49 +0200 Subject: perf_event: Provide vmalloc() based mmap() backing Some architectures such as Sparc, ARM and MIPS (basically everything with flush_dcache_page()) need to deal with dcache aliases by carefully placing pages in both kernel and user maps. These architectures typically have to use vmalloc_user() for this. However, on other architectures, vmalloc() is not needed and has the downsides of being more restricted and slower than regular allocations. Signed-off-by: Peter Zijlstra Acked-by: David Miller Cc: Andrew Morton Cc: Jens Axboe Cc: Paul Mackerras LKML-Reference: <1254830228.21044.272.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 248 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 186 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e491fb087939..9d0b5c665883 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -2091,49 +2092,31 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static unsigned long perf_data_size(struct perf_mmap_data *data) { - struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; + return data->nr_pages << (PAGE_SHIFT + data->data_order); +} - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; +#ifndef CONFIG_PERF_USE_VMALLOC - vmf->page = virt_to_page(data->data_pages[nr]); - } +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > data->nr_pages) + return NULL; - ret = 0; -unlock: - rcu_read_unlock(); + if (pgoff == 0) + return virt_to_page(data->user_page); - return ret; + return virt_to_page(data->data_pages[pgoff - 1]); } -static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { struct perf_mmap_data *data; unsigned long size; @@ -2158,19 +2141,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) goto fail_data_pages; } + data->data_order = 0; data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - if (event->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - event->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - rcu_assign_pointer(event->data, data); - - return 0; + return data; fail_data_pages: for (i--; i >= 0; i--) @@ -2182,7 +2156,7 @@ fail_user_page: kfree(data); fail: - return -ENOMEM; + return NULL; } static void perf_mmap_free_page(unsigned long addr) @@ -2193,28 +2167,169 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void __perf_mmap_data_free(struct rcu_head *rcu_head) +static void perf_mmap_data_free(struct perf_mmap_data *data) { - struct perf_mmap_data *data; int i; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > (1UL << data->data_order)) + return NULL; + + return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_mmap_data_free_work(struct work_struct *work) +{ + struct perf_mmap_data *data; + void *base; + int i, nr; + + data = container_of(work, struct perf_mmap_data, work); + nr = 1 << data->data_order; + + base = data->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + schedule_work(&data->work); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + void *all_buf; + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + INIT_WORK(&data->work, perf_mmap_data_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + data->user_page = all_buf; + data->data_pages[0] = all_buf + PAGE_SIZE; + data->data_order = ilog2(nr_pages); + data->nr_pages = 1; + + return data; + +fail_all_buf: + kfree(data); + +fail: + return NULL; +} + +#endif + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(data, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void +perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +{ + long max_size = perf_data_size(data); + + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, max_size, + event->attr.wakeup_watermark); + } + + if (!data->watermark) + data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + + + rcu_assign_pointer(event->data, data); +} + +static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + perf_mmap_data_free(data); kfree(data); } -static void perf_mmap_data_free(struct perf_event *event) +static void perf_mmap_data_release(struct perf_event *event) { struct perf_mmap_data *data = event->data; WARN_ON(atomic_read(&event->mmap_count)); rcu_assign_pointer(event->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); + call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2230,11 +2345,12 @@ static void perf_mmap_close(struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->data); struct user_struct *user = current_user(); - atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_free(event); + perf_mmap_data_release(event); mutex_unlock(&event->mmap_mutex); } } @@ -2252,6 +2368,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; + struct perf_mmap_data *data; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2314,10 +2431,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } WARN_ON(event->data); - ret = perf_mmap_data_alloc(event, nr_pages); - if (ret) + + data = perf_mmap_data_alloc(event, nr_pages); + ret = -ENOMEM; + if (!data) goto unlock; + ret = 0; + perf_mmap_data_init(event, data); + atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; @@ -2505,7 +2627,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, if (!data->writable) return true; - mask = (data->nr_pages << PAGE_SHIFT) - 1; + mask = perf_data_size(data) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2610,7 +2732,7 @@ void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { unsigned int pages_mask; - unsigned int offset; + unsigned long offset; unsigned int size; void **pages; @@ -2619,12 +2741,14 @@ void perf_output_copy(struct perf_output_handle *handle, pages = handle->data->data_pages; do { - unsigned int page_offset; + unsigned long page_offset; + unsigned long page_size; int nr; nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); + page_offset = offset & (page_size - 1); + size = min_t(unsigned int, page_size - page_offset, len); memcpy(pages[nr] + page_offset, buf, size); -- cgit v1.2.2 From fc6b177dee33365ccb29fe6d2092223cf8d679f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Oct 2009 18:17:32 +0200 Subject: futex: Nullify robust lists after cleanup The robust list pointers of user space held futexes are kept intact over an exec() call. When the exec'ed task exits exit_robust_list() is called with the stale pointer. The risk of corruption is minimal, but still it is incorrect to keep the pointers valid. Actually glibc should uninstall the robust list before calling exec() but we have to deal with it anyway. Nullify the pointers after [compat_]exit_robust_list() has been called. Reported-by: Anirban Sinha Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: Cc: stable@kernel.org --- kernel/fork.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bfee931ee3fb..88ef51ca84de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -543,11 +543,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) + if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); + tsk->robust_list = NULL; + } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) + if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } #endif #endif -- cgit v1.2.2 From 322a2c100a8998158445599ea437fb556aa95b11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Oct 2009 18:18:03 +0200 Subject: futex: Move exit_pi_state() call to release_mm() exit_pi_state() is called from do_exit() but not from do_execve(). Move it to release_mm() so it gets called from do_execve() as well. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: stable@kernel.org Cc: Anirban Sinha Cc: Peter Zijlstra --- kernel/exit.c | 2 -- kernel/fork.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ae5d8660ddff..bc2b1fdfc354 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -989,8 +989,6 @@ NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif diff --git a/kernel/fork.c b/kernel/fork.c index 88ef51ca84de..341965b0ab1c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -553,6 +553,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) tsk->compat_robust_list = NULL; } #endif + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); #endif /* Get rid of any cached register state */ -- cgit v1.2.2 From d0ec774cb2599c858be9d923bb873cf6697520d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Oct 2009 21:48:16 -0700 Subject: rcu: Move rcu_barrier() to rcutree Move the existing rcu_barrier() implementation to rcutree.c, consistent with the fact that the rcu_barrier() implementation is tied quite tightly to the RCU implementation. This opens the way to simplify and fix rcutree.c's rcu_barrier() implementation in a later patch. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12548908982563-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 120 +----------------------------------------------------- kernel/rcutree.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index e43242274466..400183346ad2 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,16 +53,8 @@ struct lockdep_map rcu_lock_map = EXPORT_SYMBOL_GPL(rcu_lock_map); #endif -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; int rcu_scheduler_active __read_mostly; -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); - /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. @@ -165,120 +157,10 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. - */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - - atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); -} - -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) -{ - BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. - */ - atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(call_rcu); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(call_rcu_bh); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(call_rcu_sched); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { - rcu_cpu_notify(self, action, hcpu); - if (action == CPU_DYING) { - /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. - */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_DOWN_PREPARE) { - /* Don't need to wait until next removal operation. */ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - } - - return NOTIFY_OK; + return rcu_cpu_notify(self, action, hcpu); } void __init rcu_init(void) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e2e272b5c277..0108570a192c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1363,6 +1363,103 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; +static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); +static struct rcu_head rcu_migrate_head[3]; +static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); + +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +static inline void wait_migrated_callbacks(void) +{ + wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); + smp_mb(); /* In case we didn't sleep. */ +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take cpucontrol mutex to protect against CPU hotplug */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. + */ + atomic_set(&rcu_barrier_cpu_count, 1); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); + wait_migrated_callbacks(); +} + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +static void rcu_migrate_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_migrate_type_count)) + wake_up(&rcu_migrate_wq); +} + /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -1459,6 +1556,28 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* Don't need to wait until next removal operation. */ + /* rcu_migrate_head is protected by cpu_add_remove_lock */ + wait_migrated_callbacks(); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * preempt_disable() in on_each_cpu() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(), + * and the dead cpu(if it exist) queues rcu_migrate_callback()s. + * + * These callbacks ensure _rcu_barrier() waits for all + * RCU callbacks of the specified type to complete. + */ + atomic_set(&rcu_migrate_type_count, 3); + call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); + call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); + call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: -- cgit v1.2.2 From e74f4c4564455c91a3b4075bb1721993c2a95dda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Oct 2009 21:48:17 -0700 Subject: rcu: Make hot-unplugged CPU relinquish its own RCU callbacks The current interaction between RCU and CPU hotplug requires that RCU block in CPU notifiers waiting for callbacks to drain. This can be greatly simplified by having each CPU relinquish its own callbacks, and for both _rcu_barrier() and CPU_DEAD notifiers to adopt all callbacks that were previously relinquished. This change also eliminates the possibility of certain types of hangs due to the previous practice of waiting for callbacks to be invoked from within CPU notifiers. If you don't every wait, you cannot hang. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1254890898456-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 151 +++++++++++++++++++++++++----------------------- kernel/rcutree.h | 11 +++- kernel/rcutree_plugin.h | 34 +++++++++++ kernel/rcutree_trace.c | 4 +- 4 files changed, 125 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0108570a192c..d8d98655c9e7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -63,6 +63,9 @@ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .orphan_cbs_list = NULL, \ + .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_qlen = 0, \ .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -837,18 +840,64 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU +/* + * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the + * specified flavor of RCU. The callbacks will be adopted by the next + * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever + * comes first. Because this is invoked from the CPU_DYING notifier, + * irqs are already disabled. + */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + *rsp->orphan_cbs_tail = rdp->nxtlist; + rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rsp->orphan_qlen += rdp->qlen; + rdp->qlen = 0; + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +} + +/* + * Adopt previously orphaned RCU callbacks. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + spin_lock_irqsave(&rsp->onofflock, flags); + rdp = rsp->rda[smp_processor_id()]; + if (rsp->orphan_cbs_list == NULL) { + spin_unlock_irqrestore(&rsp->onofflock, flags); + return; + } + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; + rdp->qlen += rsp->orphan_qlen; + rsp->orphan_cbs_list = NULL; + rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; + rsp->orphan_qlen = 0; + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + /* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { - int i; unsigned long flags; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; - struct rcu_data *rdp_me; struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -871,32 +920,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) } while (rnp != NULL); lastcomp = rsp->completed; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + spin_unlock_irqrestore(&rsp->onofflock, flags); - /* - * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiescent, so it is now - * (uncharacteristically) safe to access its rcu_data structure. - * Note also that we must carefully retain the order of the - * outgoing CPU's callbacks in order for rcu_barrier() to work - * correctly. Finally, note that we start all the callbacks - * afresh, even those that have passed through a grace period - * and are therefore ready to invoke. The theory is that hotplug - * events are rare, and that if they are frequent enough to - * indefinitely delay callbacks, you have far worse things to - * be worrying about. - */ - if (rdp->nxtlist != NULL) { - rdp_me = rsp->rda[smp_processor_id()]; - *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp_me->qlen += rdp->qlen; - rdp->qlen = 0; - } - local_irq_restore(flags); + rcu_adopt_orphan_cbs(rsp); } /* @@ -914,6 +940,14 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ +} + +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_offline_cpu(int cpu) { } @@ -1367,9 +1401,6 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); static void rcu_barrier_callback(struct rcu_head *notused) { @@ -1392,21 +1423,16 @@ static void rcu_barrier_func(void *type) call_rcu_func(head, rcu_barrier_callback); } -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, void (*func)(struct rcu_head *head))) { BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ + /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); /* @@ -1419,29 +1445,22 @@ static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, * early. */ atomic_set(&rcu_barrier_cpu_count, 1); + preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ + rcu_adopt_orphan_cbs(rsp); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(call_rcu); } -EXPORT_SYMBOL_GPL(rcu_barrier); /** * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. */ void rcu_barrier_bh(void) { - _rcu_barrier(call_rcu_bh); + _rcu_barrier(&rcu_bh_state, call_rcu_bh); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -1450,16 +1469,10 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(call_rcu_sched); + _rcu_barrier(&rcu_sched_state, call_rcu_sched); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -1556,27 +1569,21 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* Don't need to wait until next removal operation. */ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - break; case CPU_DYING: case CPU_DYING_FROZEN: /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), + * preempt_disable() in _rcu_barrier() prevents stop_machine(), * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. + * returns, all online cpus have queued rcu_barrier_func(). + * The dying CPU clears its cpu_online_mask bit and + * moves all of its RCU callbacks to ->orphan_cbs_list + * in the context of stop_machine(), so subsequent calls + * to _rcu_barrier() will adopt these callbacks and only + * then queue rcu_barrier_func() on all remaining CPUs. */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); + rcu_send_cbs_to_orphanage(&rcu_bh_state); + rcu_send_cbs_to_orphanage(&rcu_sched_state); + rcu_preempt_send_cbs_to_orphanage(); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 676eecd371d9..b40ac5706040 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -244,7 +244,15 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ + /* starting new GP. Also */ + /* protects the following */ + /* orphan_cbs fields. */ + struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ + /* orphaned by all CPUs in */ + /* a given leaf rcu_node */ + /* going offline. */ + struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ + long orphan_qlen; /* Number of orphaned cbs. */ spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -305,6 +313,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void rcu_preempt_send_cbs_to_orphanage(void); static void __init __rcu_init_preempt(void); #endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 57200fe96d0a..c0cb783aa16a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -410,6 +410,15 @@ static int rcu_preempt_needs_cpu(int cpu) return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state, call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Initialize preemptable RCU's per-CPU data. */ @@ -418,6 +427,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); } +/* + * Move preemptable RCU's callbacks to ->orphan_cbs_list. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ + rcu_send_cbs_to_orphanage(&rcu_preempt_state); +} + /* * Initialize preemptable RCU's state structures. */ @@ -563,6 +580,16 @@ static int rcu_preempt_needs_cpu(int cpu) return 0; } +/* + * Because preemptable RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Because preemptable RCU does not exist, there is no per-CPU * data to initialize. @@ -571,6 +598,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { } +/* + * Because there is no preemptable RCU, there are no callbacks to move. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ +} + /* * Because preemptable RCU does not exist, it need not be initialized. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index f09af28b8262..4b31c779e62e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -159,13 +159,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", rsp->completed, rsp->gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->orphan_qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); -- cgit v1.2.2 From 978c0b88146a7f9b364b71b5b83c5b12e7b413d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Oct 2009 21:48:18 -0700 Subject: rcu: Place root rcu_node structure in separate lockdep class Before this patch, all of the rcu_node structures were in the same lockdep class, so that lockdep would complain when rcu_preempt_offline_tasks() acquired the root rcu_node structure's lock while holding one of the leaf rcu_nodes' locks. This patch changes rcu_init_one() to use a separate spin_lock_init() for the root rcu_node structure's lock than is used for that of all of the rest of the rcu_node structures, which puts the root rcu_node structure's lock in its own lockdep class. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12548908983277-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d8d98655c9e7..705f02ac7433 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1647,7 +1647,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); + if (rnp != rcu_get_root(rsp)) + spin_lock_init(&rnp->lock); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1670,6 +1671,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[1]); } } + spin_lock_init(&rcu_get_root(rsp)->lock); } /* -- cgit v1.2.2 From fdc6f192e7e1ae80565af23cc33dc88e3dcdf184 Mon Sep 17 00:00:00 2001 From: Eero Nurkkala Date: Wed, 7 Oct 2009 11:54:26 +0300 Subject: NOHZ: update idle state also when NOHZ is inactive Commit f2e21c9610991e95621a81407cdbab881226419b had unfortunate side effects with cpufreq governors on some systems. If the system did not switch into NOHZ mode ts->inidle is not set when tick_nohz_stop_sched_tick() is called from the idle routine. Therefor all subsequent calls from irq_exit() to tick_nohz_stop_sched_tick() fail to call tick_nohz_start_idle(). This results in bogus idle accounting information which is passed to cpufreq governors. Set the inidle flag unconditionally of the NOHZ active state to keep the idle time accounting correct in any case. [ tglx: Added comment and tweaked the changelog ] Reported-by: Steven Noonan Signed-off-by: Eero Nurkkala Cc: Rik van Riel Cc: Venkatesh Pallipadi Cc: Greg KH Cc: Steven Noonan Cc: stable@kernel.org LKML-Reference: <1254907901.30157.93.camel@eenurkka-desktop> Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a21c061..89aed5933ed4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle) if (!inidle && !ts->inidle) goto end; + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(ts); /* @@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - ts->inidle = 1; - if (need_resched()) goto end; -- cgit v1.2.2 From da085681014fb43d67d9bf6d14bc068e9254bd49 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 7 Oct 2009 11:46:54 -0700 Subject: futex: fix requeue_pi key imbalance If futex_wait_requeue_pi() wakes prior to requeue, we drop the reference to the source futex_key twice, once in handle_early_requeue_pi_wakeup() and once on our way out. Remove the drop from the handle_early_requeue_pi_wakeup() and keep the get/drops together in futex_wait_requeue_pi(). Reported-by: Helge Bahmann Signed-off-by: Darren Hart Cc: Helge Bahmann Cc: Peter Zijlstra Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz Cc: stable-2.6.31 LKML-Reference: <4ACCE21E.5030805@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1e176f3ab26c..c3bb2fce11ba 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2111,7 +2111,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &q->list.plist); - drop_futex_key_refs(&q->key); if (timeout && !timeout->task) ret = -ETIMEDOUT; -- cgit v1.2.2 From d25105e8911bff1dbd68e387f12901c5b1a15fe8 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 9 Oct 2009 12:40:42 +0200 Subject: writeback: account IO throttling wait as iowait It makes sense to do IOWAIT when someone is blocked due to IO throttle, as suggested by Kame and Peter. There is an old comment for not doing IOWAIT on throttle, however it has been mismatching the code for a long time. If we stop accounting IOWAIT for 2.6.32, it could be an undesirable behavior change. So restore the io_schedule. CC: KAMEZAWA Hiroyuki CC: Peter Zijlstra Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1535f3884b88..074f753f7449 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6720,9 +6720,6 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { -- cgit v1.2.2 From 3365e7798760dc6c190a9bbb0945a38f02625438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2009 10:12:41 +0200 Subject: lockdep: Use cpu_clock() for lockstat Some tracepoint magic (TRACE_EVENT(lock_acquired)) relies on the fact that lock hold times are positive and uses div64 on that. That triggered a build warning on MIPS, and probably causes bad output in certain circumstances as well. Make it truly positive. Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra LKML-Reference: <1254818502.21044.112.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3815ac1d58b2..9af56723c096 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static inline u64 lockstat_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + static int lock_point(unsigned long points[], unsigned long ip) { int i; @@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip) return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static void lock_time_inc(struct lock_time *lt, u64 time) { if (time > lt->max) lt->max = time; @@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats) static void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; - s64 holdtime; + u64 holdtime; if (!lock_stat) return; - holdtime = sched_clock() - hlock->holdtime_stamp; + holdtime = lockstat_clock() - hlock->holdtime_stamp; stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) @@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); + hlock->holdtime_stamp = lockstat_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) @@ -3322,7 +3327,7 @@ found_it: if (hlock->instance != lock) return; - hlock->waittime_stamp = sched_clock(); + hlock->waittime_stamp = lockstat_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, @@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - u64 now; - s64 waittime = 0; + u64 now, waittime = 0; int i, cpu; depth = curr->lockdep_depth; @@ -3374,7 +3378,7 @@ found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { - now = sched_clock(); + now = lockstat_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } -- cgit v1.2.2 From f5dc37530ba8a35aae0f7f4f13781d1904f71e94 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 9 Oct 2009 08:35:03 +0200 Subject: sched: Update the clock of runqueue select_task_rq() selected In try_to_wake_up(), we update the runqueue clock, but select_task_rq() may select a different runqueue than the one we updated, leaving the new runqueue's clock stale for a bit. This patch cures occasional huge latencies reported by latencytop when coming out of idle on a mostly idle NO_HZ box. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1255070103.7639.30.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 76c0e9691fc0..e3f8f4f61c8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2311,7 +2311,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; + struct rq *rq, *orig_rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2319,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2350,6 +2350,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, set_task_cpu(p, cpu); rq = task_rq_lock(p, &flags); + + if (rq != orig_rq) + update_rq_clock(rq); + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); -- cgit v1.2.2 From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 7 Oct 2009 17:09:06 +0400 Subject: headers: remove sched.h from interrupt.h After m68k's task_thread_info() doesn't refer to current, it's possible to remove sched.h from interrupt.h and not break m68k! Many thanks to Heiko Carstens for allowing this. Signed-off-by: Alexey Dobriyan --- kernel/irq/handle.c | 1 + kernel/mutex-debug.c | 1 + kernel/time/timekeeping.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80554db..17c71bb565c6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e5a560..ec815a960b5d 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ecd..c3a4e2907eaa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From e17b38bf9e70d74f3739a600db75240078ac1407 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 11 Oct 2009 19:12:00 -0700 Subject: sched: Fix missing kernel-doc notation The following htmldocs warnings: Warning(kernel/sched.c:685): No description found for parameter 'cpu' Warning(kernel/sched.c:3676): No description found for parameter 'sd' Trigger because new parameters were added to update_rq_clock() and update_group_power() without updating the kernel-doc notation. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <4AD29070.7070002@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e3f8f4f61c8e..789001da0a94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -676,6 +676,7 @@ inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock @@ -3660,6 +3661,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu -- cgit v1.2.2 From 43046b606673c9c991919ff75b980b72541e9ede Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 14 Oct 2009 09:16:42 -0700 Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed work It basically turns a delayed work into an immediate work, and then waits for it to finish. --- kernel/workqueue.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2df93b1..ccefe574dcf7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -639,6 +639,24 @@ int schedule_delayed_work(struct delayed_work *dwork, } EXPORT_SYMBOL(schedule_delayed_work); +/** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use -- cgit v1.2.2 From 8c53e46314562fe814b0afef6cfcbd2f562b017c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 14 Oct 2009 09:16:42 -0700 Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed work It basically turns a delayed work into an immediate work, and then waits for it to finish, thus allowing you to force (and wait for) an immediate flush of a delayed work. We'll want to use this in the tty layer to clean up tty_flush_to_ldisc(). Acked-by: Oleg Nesterov [ Fixed to use 'del_timer_sync()' as noted by Oleg ] Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2df93b1..47cdd7e76f2b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -639,6 +639,24 @@ int schedule_delayed_work(struct delayed_work *dwork, } EXPORT_SYMBOL(schedule_delayed_work); +/** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use -- cgit v1.2.2 From 04bf7539c08d64184736cdc5e4ad617eda77eb0f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 Oct 2009 06:45:02 +0200 Subject: PM: Make warning in suspend_test_finish() less likely to happen Increase TEST_SUSPEND_SECONDS to 10 so the warning in suspend_test_finish() doesn't annoy the users of slower systems so much. Also, make the warning print the suspend-resume cycle time, so that we know why the warning actually triggered. Patch prepared during the hacking session at the Kernel Summit in Tokyo. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/power/suspend_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1acf9c..25596e450ac7 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* -- cgit v1.2.2 From cf8517cf905b5cd31d5790250b9ac39f7cb8aa53 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:16 -0400 Subject: tracing: Update *ppos instead of filp->f_pos Instead of directly updating filp->f_pos we should update the *ppos argument. The filp->f_pos gets updated within the file_pos_write() function called from sys_write(). Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.399670810@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37ba67e33265..9c451a1930b6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -740,7 +740,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c820b0310a12..b20d3ec75de9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) -- cgit v1.2.2 From 3e69533b51930a7169235db2caf703884e6e3bbb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:17 -0400 Subject: tracing: Fix trace_seq_printf() return value trace_seq_printf() return value is a little ambiguous. It currently returns the length of the space available in the buffer. printf usually returns the amount written. This is not adequate here, because: trace_seq_printf(s, ""); is perfectly legal, and returning 0 would indicate that it failed. We can always see the amount written by looking at the before and after values of s->len. This is not quite the same use as printf. We only care if the string was successfully written to the buffer or not. Make trace_seq_printf() return 0 if the trace oversizes the buffer's free space, 1 otherwise. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.631787612@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed17565826b0..b6c12c6a1bcd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); -- cgit v1.2.2 From 67b394f7f26d84edb7294cc6528ab7ca6daa2ad1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:18 -0400 Subject: tracing: Fix comment typo and documentation example Trivial patch to fix a documentation example and to fix a comment. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.871719877@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff01970547..217f6991184f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2681,7 +2681,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer -- cgit v1.2.2 From 6d3f1e12f46a2f9a1bb7e7aa433df8dd31ce5647 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:19 -0400 Subject: tracing: Remove cpu arg from the rb_time_stamp() function The cpu argument is not used inside the rb_time_stamp() function. Plus fix a typo. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233647.118547500@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 217f6991184f..3ffa502fb243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -483,7 +483,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +494,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +599,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -1868,7 +1868,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -2111,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. -- cgit v1.2.2