From f1b499f029c5dde85d46a8811353c62f29157541 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Thu, 5 Aug 2010 17:10:53 +0200 Subject: lockdep: Remove __debug_show_held_locks There is no longer any functional difference between __debug_show_held_locks() and debug_show_held_locks(), so remove the former. Signed-off-by: John Kacur Cc: Peter Zijlstra LKML-Reference: <1281021054-4228-1-git-send-email-jkacur@redhat.com> Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 2 +- kernel/lockdep.c | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..bca942379559 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - __debug_show_held_locks(t); + debug_show_held_locks(t); touch_nmi_watchdog(); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..84baa71cfda5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3775,7 +3775,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); * Careful: only use this function if you are sure that * the task cannot run in parallel! 
*/ -void __debug_show_held_locks(struct task_struct *task) +void debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3783,12 +3783,6 @@ void __debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) -- cgit v1.2.2 From 6a103b0d44e9f97dc430002cf3ac7a7defa3819f Mon Sep 17 00:00:00 2001 From: John Kacur Date: Thu, 5 Aug 2010 17:10:54 +0200 Subject: lockup detector: Fix grammar by adding a missing "to" in the comments This fixes a minor grammar problem in the comments in hung_task.c Signed-off-by: John Kacur Cc: Peter Zijlstra LKML-Reference: <1281021054-4228-2-git-send-email-jkacur@redhat.com> Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bca942379559..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * exit the grace period. For classic RCU, a reschedule is required. + * to exit the grace period. For classic RCU, a reschedule is required. */ static void rcu_lock_break(struct task_struct *g, struct task_struct *t) { -- cgit v1.2.2 From ca5ecddfa8fcbd948c95530e7e817cee9fb43a3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Apr 2010 14:39:09 -0700 Subject: rcu: define __rcu address space modifier for sparse This commit provides definitions for the __rcu annotation defined earlier. This annotation permits sparse to check for correct use of RCU-protected pointers. 
If a pointer that is annotated with __rcu is accessed directly (as opposed to via rcu_dereference(), rcu_assign_pointer(), or one of their variants), sparse can be made to complain. To enable such complaints, use the new default-disabled CONFIG_SPARSE_RCU_POINTER kernel configuration option. Please note that these sparse complaints are intended to be a debugging aid, -not- a code-style-enforcement mechanism. There are special rcu_dereference_protected() and rcu_access_pointer() accessors for use when RCU read-side protection is not required, for example, when no other CPU has access to the data structure in question or while the current CPU holds the update-side lock. This patch also updates a number of docbook comments that were showing their age. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Christopher Li Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..6c79e851521c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the * CONFIG_PROVE_RCU and not cases. Note that if someone uses * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 
*/ -- cgit v1.2.2 From 67bdbffd696f29a0b68aa8daa285783a06651583 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2010 16:55:13 +0100 Subject: rculist: avoid __rcu annotations This avoids warnings from missing __rcu annotations in the rculist implementation, making it possible to use the same lists in both RCU and non-RCU cases. We can add rculist annotations later, together with lockdep support for rculist, which is missing as well, but that may involve changing all the users. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Reviewed-by: Josh Triplett --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..0f90c2f713f1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) -- cgit v1.2.2 From 2c392b8c3450ceb69ba1b93cb0cddb3998fb8cdc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2010 19:41:39 +0100 Subject: cgroups: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Acked-by: Paul Menage Cc: Li Zefan Reviewed-by: Josh Triplett --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 192f88c5b0f9..e5c5497a7dca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. 
*/ -- cgit v1.2.2 From e546f485e1d7520ca0200502cdcc11b503f4805c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 21 Jun 2010 16:57:42 +0800 Subject: rcutorture: add random preemption Add random preemption to help us torture the preemptable rcu. srcu_read_delay() also calls rcu_read_delay() for shorter delays. Added comment to preempt_schedule() call indicating that no quiescent states happen if preemption is disabled. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..729710273dcb 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -303,6 +303,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) mdelay(longdelay_ms); if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -536,6 +540,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); } static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -- cgit v1.2.2 From 394f99a9007d4274f7076bb8553ab0ff9707688b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 28 Jun 2010 16:25:04 +0800 Subject: rcu: simplify the usage of percpu data &percpu_data is compatible with allocated percpu data. And we use it and remove the "->rda[NR_CPUS]" array, saving significant storage on systems with large numbers of CPUs. This does add an additional level of indirection and thus an additional cache line referenced, but because ->rda is not used on the read side, this is OK. 
Signed-off-by: Lai Jiangshan Reviewed-by: Tejun Heo Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 42 +++++++++++++++--------------------------- kernel/rcutree.h | 2 +- kernel/rcutree_plugin.h | 4 ++-- kernel/rcutree_trace.c | 2 +- 4 files changed, 19 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..5b1c3c231bae 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -712,7 +712,7 @@ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { @@ -960,7 +960,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) { int i; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ @@ -984,7 +984,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) struct rcu_data *rdp; raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); if (rsp->orphan_cbs_list == NULL) { raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; @@ -1007,7 +1007,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) unsigned long flags; unsigned long mask; int need_report = 0; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. 
*/ @@ -1226,7 +1226,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } if (mask != 0) { @@ -1402,7 +1403,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); rcu_process_gp_end(rsp, rdp); check_for_new_grace_period(rsp, rdp); @@ -1701,7 +1702,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1729,7 +1730,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1865,7 +1866,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) /* * Helper function for rcu_init() that initializes one rcu_state structure. 
*/ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) { static char *buf[] = { "rcu_node_level_0", "rcu_node_level_1", @@ -1918,37 +1920,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } + rsp->rda = rda; rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - rsp->rda[i]->mynode = rnp; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } } -/* - * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used - * nowhere else! Assigns leaf node pointers into each CPU's rcu_data - * structure. - */ -#define RCU_INIT_FLAVOR(rsp, rcu_data) \ -do { \ - int i; \ - \ - for_each_possible_cpu(i) { \ - (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - } \ - rcu_init_one(rsp); \ -} while (0) - void __init rcu_init(void) { int cpu; rcu_bootup_announce(); - RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); - RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..5ce197e87792 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -283,7 +283,7 @@ struct rcu_state { struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. 
*/ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..9906f85c7780 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = rcu_preempt_state.rda[cpu]; + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -771,7 +771,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) */ static void __init __rcu_init_preempt(void) { - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } /* diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..458e032a3a30 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -262,7 +262,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) struct rcu_data *rdp; for_each_possible_cpu(cpu) { - rdp = rsp->rda[cpu]; + rdp = per_cpu_ptr(rsp->rda, cpu); if (rdp->beenonline) print_one_rcu_pending(m, rdp); } -- cgit v1.2.2 From 4221a9918e38b7494cee341dda7b7b4bb8c04bde Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 26 Jun 2010 01:08:19 +0900 Subject: Add RCU check for find_task_by_vpid(). find_task_by_vpid() says "Must be called under rcu_read_lock().". But due to commit 3120438 "rcu: Disable lockdep checking in RCU list-traversal primitives", we are currently unable to catch "find_task_by_vpid() with tasklist_lock held but RCU lock not held" errors due to the RCU-lockdep checks being suppressed in the RCU variants of the struct list_head traversals. This commit therefore places an explicit check for being in an RCU read-side critical section in find_task_by_pid_ns(). 
=================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/pid.c:386 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by rc.sysinit/1102: #0: (tasklist_lock){.+.+..}, at: [] sys_setpgid+0x40/0x160 stack backtrace: Pid: 1102, comm: rc.sysinit Not tainted 2.6.35-rc3-dirty #1 Call Trace: [] lockdep_rcu_dereference+0x94/0xb0 [] find_task_by_pid_ns+0x6d/0x70 [] find_task_by_vpid+0x18/0x20 [] sys_setpgid+0x47/0x160 [] sysenter_do_call+0x12/0x36 Commit updated to use a new rcu_lockdep_assert() exported API rather than the old internal __do_rcu_dereference(). Signed-off-by: Tetsuo Handa Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 0f90c2f713f1..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } -- cgit v1.2.2 From b163760e37047781b37c412cde54d146ac4b651f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2010 16:21:38 -0700 Subject: rcu: make CPU stall warning timeout configurable Also set the default to 60 seconds, up from the previous hard-coded timeout of 10 seconds. This allows people who care to set short timeouts, while avoiding people with unusual configurations (make randconfig!!!) from being bothered with spurious CPU stall warnings. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5ce197e87792..183ebf405315 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -254,9 +254,10 @@ struct rcu_data { #define RCU_STALL_DELAY_DELTA 0 #endif -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ -- cgit v1.2.2 From 742734eea0cf5314cde5945963ed964be167bd84 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Jun 2010 11:43:52 -0700 Subject: rcu: add boot parameter to suppress RCU CPU stall warning messages Although the RCU CPU stall warning messages are a very good way to alert people to a problem, once alerted, it is sometimes helpful to shut them off in order to avoid obscuring other messages that might be being used to track down the problem. Although you can rebuild the kernel with CONFIG_RCU_CPU_STALL_DETECTOR=n, this is sometimes inconvenient. This commit therefore adds a boot parameter named "rcu_cpu_stall_suppress" that shuts these messages off without requiring a rebuild (though a reboot might be needed for those not brave enough to patch their kernel while it is running). This message-suppression was already in place for the panic case, so this commit need only rename the variable and export it via module_param(). Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b1c3c231bae..f3d5906cbc21 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -143,6 +143,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_suppress __read_mostly; +module_param(rcu_cpu_stall_suppress, int, 0); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_panicking __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) { @@ -530,7 +535,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; - if (rcu_cpu_stall_panicking) + if (rcu_cpu_stall_suppress) return; delta = jiffies - rsp->jiffies_stall; rnp = rdp->mynode; @@ -548,7 +553,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { - rcu_cpu_stall_panicking = 1; + rcu_cpu_stall_suppress = 1; return NOTIFY_DONE; } -- cgit v1.2.2 From 77d8485a8b5416c615b6acd95f01bfcacd7d81ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Jul 2010 17:38:59 -0700 Subject: rcu: improve kerneldoc for rcu_read_lock(), call_rcu(), and synchronize_rcu() Make it explicit that new RCU read-side critical sections that start after call_rcu() and synchronize_rcu() start might still be running after the end of the relevant grace period. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9906f85c7780..63bb7714fdeb 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -546,9 +546,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. */ void synchronize_rcu(void) { -- cgit v1.2.2 From f2e0dd7090eddef427ab9d9f81de122244cded51 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Jul 2010 14:38:30 -0700 Subject: rcu: allow RCU CPU stall warning messages to be controlled in /sys Set the permissions of the rcu_cpu_stall_suppress to 644 to enable RCU CPU stall warnings to be enabled and disabled at runtime via sysfs. Suggested-by: Josh Triplett Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f3d5906cbc21..5d910beefff2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -145,7 +145,7 @@ module_param(qlowmark, int, 0); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR int rcu_cpu_stall_suppress __read_mostly; -module_param(rcu_cpu_stall_suppress, int, 0); +module_param(rcu_cpu_stall_suppress, int, 0644); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); -- cgit v1.2.2 From 910b1b7e19a292ff685001caf1bf1a9775b771a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jul 2010 08:05:56 -0700 Subject: rcu: Allow RCU CPU stall warnings to be off at boot, but manually enablable Currently, if RCU CPU stall warnings are enabled, they are enabled immediately upon boot. They can be manually disabled via /sys (and also re-enabled via /sys), and are automatically disabled upon panic. However, some users need RCU CPU stalls to be disabled at boot time, but to be enabled without rebuilding/rebooting. For example, someone running a real-time application in production might not want the additional latency of RCU CPU stall detection in normal operation, but might need to enable it at any point for fault isolation purposes. This commit therefore provides a new CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE kernel configuration parameter that maintains the current behavior (enable at boot) by default, but allows a kernel to be configured with RCU CPU stall detection built into the kernel, but disabled at boot time. Requested-by: Clark Williams Requested-by: John Kacur Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5d910beefff2..5aab7dabd0d5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -144,7 +144,7 @@ module_param(qhimark, int, 0); module_param(qlowmark, int, 0); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_suppress __read_mostly; +int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; module_param(rcu_cpu_stall_suppress, int, 0644); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 183ebf405315..bb4d08695c45 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -264,6 +264,12 @@ struct rcu_data { /* scheduling clock irq */ /* before ratting on them. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE +#define RCU_CPU_STALL_SUPPRESS_INIT 0 +#else +#define RCU_CPU_STALL_SUPPRESS_INIT 1 +#endif + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -- cgit v1.2.2 From 277b199800ac90811ac86d215063df1984f51619 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 20 Aug 2010 11:03:51 +0800 Subject: lockup_detector: Make callback function static watchdog_overflow_callback() is only used in kernel/watchdog.c. 
Signed-off-by: Lin Ming Cc: Peter Zijlstra Cc: Don Zickus LKML-Reference: <1282273431.16443.32.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 613bc1f04610..b60e2a869bba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -202,7 +202,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { -- cgit v1.2.2 From a57eb940d130477a799dfb24a570ee04979c0f7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Jun 2010 16:49:16 -0700 Subject: rcu: Add a TINY_PREEMPT_RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a small-memory-footprint uniprocessor-only implementation of preemptible RCU. This implementation uses but a single blocked-tasks list rather than the combinatorial number used per leaf rcu_node by TREE_PREEMPT_RCU, which reduces memory consumption and greatly simplifies processing. This version also takes advantage of uniprocessor execution to accelerate grace periods in the case where there are no readers. The general design is otherwise broadly similar to that of TREE_PREEMPT_RCU. This implementation is a step towards having RCU implementation driven off of the SMP and PREEMPT kernel configuration variables, which can happen once this implementation has accumulated sufficient experience. Removed ACCESS_ONCE() from __rcu_read_unlock() and added barrier() as suggested by Steve Rostedt in order to avoid the compiler-reordering issue noted by Mathieu Desnoyers (http://lkml.org/lkml/2010/8/16/183). 
As can be seen below, CONFIG_TINY_PREEMPT_RCU represents almost 5Kbyte savings compared to CONFIG_TREE_PREEMPT_RCU. Of course, for non-real-time workloads, CONFIG_TINY_RCU is even better. CONFIG_TREE_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 6170 825 28 7023 kernel/rcutree.o ---- 7026 Total CONFIG_TINY_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 2081 81 8 2170 kernel/rcutiny.o ---- 2183 Total CONFIG_TINY_RCU (non-preemptible) text data bss dec filename 13 0 0 13 kernel/rcupdate.o 719 25 0 744 kernel/rcutiny.o --- 757 Total Requested-by: Loïc Minier Signed-off-by: Paul E. McKenney --- kernel/Makefile | 1 + kernel/rcutiny.c | 33 ++- kernel/rcutiny_plugin.h | 582 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 594 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0b72d1a74be0..17046b6e7c90 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -86,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Forward declarations for rcutiny_plugin.h. 
*/ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); } /* @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, } /* - * Post an RCU callback to be invoked after the end of an RCU grace + * Post an RCU callback to be invoked after the end of an RCU-sched grace * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { __call_rcu(head, func, &rcu_sched_ctrlblk); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Post an RCU bottom-half callback to be invoked after any subsequent @@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. 
*/ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - void rcu_barrier_bh(void) { struct rcu_synchronize rcu; @@ -289,5 +286,3 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } - -#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..e6bc1b447c6c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -1,7 +1,7 @@ /* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,11 +17,587 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright IBM Corporation, 2009 + * Copyright (c) 2010 Linaro * * Author: Paul E. McKenney */ +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include + +/* FIXME: merge with definitions in kernel/rcutree.h. */ +#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) +#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. 
*/ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. + */ +static int rcu_cpu_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs.
+ */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* + * If there is no GP, or if blocked readers are still blocking GP, + * then there is nothing more to do. 
+ */ + if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. */ + rcu_preempt_ctrlblk.gpnum++; + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. 
+ * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. 
+ */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + list_del(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; + INIT_LIST_HEAD(&t->rcu_node_entry); + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. 
+ */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need to wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs.
+ */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (!rcu_preempt_running_reader() && rcu_preempt_gp_in_progress()) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + raise_softirq(RCU_SOFTIRQ); + if (rcu_preempt_gp_in_progress() && rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from __rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible-RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it.
*/ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier(). */ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely on the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section.
Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave. + */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + local_irq_restore(flags); + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (rcu_preempted_readers_exp()) + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? 
+ */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible -RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include -- cgit v1.2.2 From 8cdd32a918350430483751feaae1c19cef816f69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Aug 2010 14:23:03 -0700 Subject: rcu: refer RCU CPU stall-warning victims to stallwarn.txt There is some documentation on RCU CPU stall warnings contained in Documentation/RCU/stallwarn.txt, but it will not be apparent to someone who runs into such a warning while under time pressure. This commit therefore adds comments preceding the printk()s pointing out the location of this documentation. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5aab7dabd0d5..ff214118e4b8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -487,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* OK, time to rat on our buddy... */ - + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { @@ -517,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); -- cgit v1.2.2 From 53d84e004d5e8c018be395c4330dc72fd60bd13e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Aug 2010 14:28:53 -0700 Subject: rcu: permit suppressing current grace period's CPU stall warnings When using a kernel debugger, a long sojourn in the debugger can get you lots of RCU CPU stall warnings once you resume. This might not be helpful, especially if you are using the system console. This patch therefore allows RCU CPU stall warnings to be suppressed, but only for the duration of the current set of grace periods. This differs from Jason's original patch in that it adds support for tiny RCU and preemptible RCU, and uses a slightly different method for suppressing the RCU CPU stall warning messages. Signed-off-by: Jason Wessel Signed-off-by: Paul E. 
McKenney Tested-by: Jason Wessel --- kernel/rcutree.c | 20 ++++++++++++++++++++ kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ff214118e4b8..42140a860bb9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -565,6 +565,22 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) return NOTIFY_DONE; } +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + static struct notifier_block rcu_panic_block = { .notifier_call = rcu_panic, }; @@ -584,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +void rcu_cpu_stall_reset(void) +{ +} + static void __init check_cpu_stall_init(void) { } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index bb4d08695c45..7abd439a7573 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -372,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 63bb7714fdeb..561410f70d4a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -417,6 +417,16 @@ static void 
rcu_print_task_stall(struct rcu_node *rnp) } } +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -867,6 +877,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { } +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* -- cgit v1.2.2 From a3dc3fb161f9b4066c0fce22db72638af8baf83b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Aug 2010 16:16:25 -0700 Subject: rcu: repair code-duplication FIXMEs Combine the duplicate definitions of ULONG_CMP_GE(), ULONG_CMP_LT(), and rcu_preempt_depth() into include/linux/rcupdate.h. Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 4 ---- kernel/rcutree.h | 3 --- 2 files changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index e6bc1b447c6c..c5bea1137dcb 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -26,10 +26,6 @@ #include -/* FIXME: merge with definitions in kernel/rcutree.h. */ -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) - /* Global control variables for preemptible RCU. */ struct rcu_preempt_ctrlblk { struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. 
*/ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7abd439a7573..7918ba61873f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -272,9 +272,6 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) - /* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. The root (first level) -- cgit v1.2.2 From 7b0b759b65247cbc66384a912be9acf8d4800636 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Aug 2010 14:18:46 -0700 Subject: rcu: combine duplicate code, courtesy of CONFIG_PREEMPT_RCU The CONFIG_PREEMPT_RCU kernel configuration parameter was recently re-introduced, but as an indication of the type of RCU (preemptible vs. non-preemptible) instead of as selecting a given implementation. This commit uses CONFIG_PREEMPT_RCU to combine duplicate code from include/linux/rcutiny.h and include/linux/rcutree.h into include/linux/rcupdate.h. This commit also combines a few other pieces of duplicate code that have accumulated. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 561410f70d4a..87f60f06b18e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -938,15 +938,6 @@ static void rcu_preempt_process_callbacks(void) { } -/* - * In classic RCU, call_rcu() is just call_rcu_sched(). - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptable RCU does not exist, map to rcu-sched. -- cgit v1.2.2 From 80dcf60e6b97c7363971e7a0a788d8484d35f8a6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 19 Aug 2010 16:57:45 -0700 Subject: rcu: apply TINY_PREEMPT_RCU read-side speedup to TREE_PREEMPT_RCU Replace one of the ACCESS_ONCE() calls in each of __rcu_read_lock() and __rcu_read_unlock() with barrier() as suggested by Steve Rostedt in order to avoid the potential compiler-optimization-induced bug noted by Mathieu Desnoyers. Located-by: Mathieu Desnoyers Suggested-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 87f60f06b18e..e9e0bc74ff37 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) */ void __rcu_read_lock(void) { - ACCESS_ONCE(current->rcu_read_lock_nesting)++; + current->rcu_read_lock_nesting++; barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); #ifdef CONFIG_PROVE_LOCKING -- cgit v1.2.2 From dd7c4d89730a1be2c1d361a8ae1f0fe9465ccf9c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Aug 2010 10:51:17 -0700 Subject: rcu: performance fixes to TINY_PREEMPT_RCU callback checking This commit tightens up checks in rcu_preempt_check_callbacks() to avoid unnecessary special handling at rcu_read_unlock() time. Signed-off-by: Paul E. 
McKenney --- kernel/rcutiny_plugin.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index c5bea1137dcb..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -72,7 +72,7 @@ static void rcu_report_exp_done(void); /* * Return true if the CPU has not yet responded to the current grace period. */ -static int rcu_cpu_cur_gp(void) +static int rcu_cpu_blocking_cur_gp(void) { return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; } @@ -229,7 +229,7 @@ void rcu_preempt_note_context_switch(void) * cannot end. */ list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); - if (rcu_cpu_cur_gp()) + if (rcu_cpu_blocking_cur_gp()) rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; } @@ -368,12 +368,16 @@ static void rcu_preempt_check_callbacks(void) { struct task_struct *t = current; - if (!rcu_preempt_running_reader() && rcu_preempt_gp_in_progress()) + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) raise_softirq(RCU_SOFTIRQ); - if (rcu_preempt_gp_in_progress() && rcu_preempt_running_reader()) + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -- cgit v1.2.2 From 81a294c44e973dc7182e4733421b7cb2daba3c29 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Aug 2010 09:52:50 -0700 Subject: rcu: fix _oddness handling of verbose stall warnings CONFIG_RCU_CPU_STALL_VERBOSE depends on CONFIG_TREE_PREEMPT_RCU, but rcu_bootup_announce_oddness() complains if CONFIG_RCU_CPU_STALL_VERBOSE is not set even in the case of CONFIG_TREE_RCU. This commit therefore fixes rcu_bootup_announce_oddness() to avoid insisting on impossibilities. Reported-by: Guy Martin Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index e9e0bc74ff37..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU-based detection of stalled CPUs is disabled.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_VERBOSE +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif #if NUM_RCU_LVL_4 != 0 -- cgit v1.2.2 From ed2d372c0738386b8a184a6a6bea9c16df6ffb68 Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Mon, 6 Sep 2010 16:37:05 +0200 Subject: sched: Remove unnecessary #ifdef CONFIG_SMP The CONFIG_SMP ifdef isn't necessary at this point, because it is checked in an outer ifdef level already and has no effect here. Cleanup only, no functional effect. Signed-off-by: Christian Dietrich Cc: vamos-dev@i4.informatik.uni-erlangen.de Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Tejun Heo LKML-Reference: <7a3a39ef3f765a4473cb026b1f204059568a7098.1283782701.git.qy03fugy@stud.informatik.uni-erlangen.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 09b574e7f4df..8eef8e5512d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -426,9 +426,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -437,7 +435,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. 
-- cgit v1.2.2 From f269893c575167447cc9f6d1867e639fb5b6f0c5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Aug 2010 10:28:15 +0200 Subject: sched: Merge cpu_to_core_group functions Merge and simplify the two cpu_to_core_group variants so that the resulting function follows the same pattern like cpu_to_phys_group. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <20100831082843.953617555@de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8eef8e5512d4..1a0c084b1cf9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6552,31 +6552,23 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct static_sched_domain, core_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int cpu_to_core_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; - +#ifdef CONFIG_SCHED_SMT cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); +#else + group = cpu; +#endif if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } -#elif defined(CONFIG_SCHED_MC) -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; -} -#endif +#endif /* CONFIG_SCHED_MC */ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); -- cgit v1.2.2 From 01a08546af311c065f34727787dd0cc8dc0c216f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Aug 2010 10:28:16 +0200 Subject: sched: Add book scheduling domain On top of the SMT and MC 
scheduling domains this adds the BOOK scheduling domain. This is useful for NUMA like machines which do not have an interface which tells which piece of memory is attached to which node or where the hardware performs striping. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <20100831082844.253053798@de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1a0c084b1cf9..26f83e2f1534 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6506,6 +6506,7 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t this_sibling_map; cpumask_var_t this_core_map; + cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6517,6 +6518,7 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, + sa_this_book_map, sa_this_core_map, sa_this_sibling_map, sa_nodemask, @@ -6570,6 +6572,31 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, } #endif /* CONFIG_SCHED_MC */ +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); + +static int +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) +{ + int group = cpu; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); +#endif + if (sg) + *sg = &per_cpu(sched_group_book, group).sg; + return group; +} +#endif /* CONFIG_SCHED_BOOK */ + static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -6578,7 
+6605,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK + cpumask_and(mask, cpu_book_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC) cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6839,6 +6869,9 @@ SD_INIT_FUNC(CPU) #ifdef CONFIG_SCHED_MC SD_INIT_FUNC(MC) #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif static int default_relax_domain_level = -1; @@ -6888,6 +6921,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_book_map: + free_cpumask_var(d->this_book_map); /* fall through */ case sa_this_core_map: free_cpumask_var(d->this_core_map); /* fall through */ case sa_this_sibling_map: @@ -6934,8 +6969,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) return sa_this_core_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_book_map; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -6993,6 +7030,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_book_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK + sd = &per_cpu(book_domains, i).sd; + SD_INIT(sd, BOOK); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, 
cpu_book_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + static struct sched_domain *__build_mc_sched_domain(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *parent, int i) @@ -7049,6 +7103,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_core_group, d->send_covered, d->tmpmask); break; +#endif +#ifdef CONFIG_SCHED_BOOK + case SD_LV_BOOK: /* set up book groups */ + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); + if (cpu == cpumask_first(d->this_book_map)) + init_sched_build_groups(d->this_book_map, cpu_map, + &cpu_to_book_group, + d->send_covered, d->tmpmask); + break; #endif case SD_LV_CPU: /* set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); @@ -7097,12 +7160,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); } @@ -7133,6 +7198,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, init_sched_groups_power(i, sd); } #endif +#ifdef CONFIG_SCHED_BOOK + for_each_cpu(i, cpu_map) { + sd = &per_cpu(book_domains, i).sd; + init_sched_groups_power(i, sd); + } +#endif for_each_cpu(i, cpu_map) { sd = &per_cpu(phys_domains, i).sd; @@ -7158,6 +7229,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) + sd = 
&per_cpu(book_domains, i).sd; #else sd = &per_cpu(phys_domains, i).sd; #endif -- cgit v1.2.2 From 8af3c153baf95374eff20a37f00c59a295b52756 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Tue, 7 Sep 2010 16:43:46 +0200 Subject: ntp: Clamp PLL update interval Clamp update interval to reduce PLL gain with low sampling rate (e.g. intermittent network connection) to avoid instability. The clamp roughly corresponds to the loop time constant, it's 8 * poll interval for SHIFT_PLL 2 and 32 * poll interval for SHIFT_PLL 4. This gives good results without affecting the gain in normal conditions where ntpd skips only up to seven consecutive samples. Signed-off-by: Miroslav Lichvar Acked-by: john stultz LKML-Reference: <1283870626-9472-1-git-send-email-mlichvar@redhat.com> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c63116863a80..d2321891538f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -149,10 +149,18 @@ static void ntp_update_offset(long offset) time_reftime = get_seconds(); offset64 = offset; - freq_adj = (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + freq_adj = ntp_update_offset_fll(offset64, secs); - freq_adj += ntp_update_offset_fll(offset64, secs); + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. 
+ */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); -- cgit v1.2.2 From 7740191cd909b75d75685fb08a5d1f54b8a9d28b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 13 Sep 2010 17:47:00 -0400 Subject: sched: Fix string comparison in /proc/sched_features Fix incorrect handling of the following case: INTERACTIVE INTERACTIVE_SOMETHING_ELSE The comparison only checks up to each element's length. Changelog since v1: - Embellish using some Rostedtisms. [ mingo: ^^ == smaller and cleaner ] Signed-off-by: Mathieu Desnoyers Reviewed-by: Steven Rostedt Cc: Cc: Peter Zijlstra Cc: Tony Lindgren LKML-Reference: <20100913214700.GB16118@Krystal> Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 26f83e2f1534..b40b82e33590 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -721,7 +721,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -732,6 +732,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -739,9 +740,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else -- cgit v1.2.2 From 31915ab4cbf507aadab40847cf9989da5e88b090 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Sep 2010 14:42:25 +0200 Subject: sched: Remove branch hints within context_switch() With 
710390d9 "sched: Optimize branch hint in context_switch()" the branch hint logic within context_switch() got inverted. In fact the hints "if (likely(!mm))" and "if (likely(!prev->mm))" mean that it is likely that the previous and next task are kernel threads. That assumption is certainly counterintuitive, but Tim has shown that at least with his workload this is true. Nevertheless the truth is: it depends on the current workload. So just remove the annotations which also improves readability. Reported-by: Tim Blechmann Signed-off-by: Heiko Carstens Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20100916124225.GA2209@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b40b82e33590..16a1129f51ec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2845,14 +2845,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (likely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (likely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } -- cgit v1.2.2 From 1dcc41bb037533839753df983d31778b30b67d93 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 14 Sep 2010 21:43:46 +0900 Subject: futex: Change 3rd arg of fetch_robust_entry() to unsigned int* Sparse complains: kernel/futex.c:2495:59: warning: incorrect type in argument 3 (different signedness) Make 3rd argument of fetch_robust_entry() 'unsigned int'.
Signed-off-by: Namhyung Kim Cc: Peter Zijlstra Cc: Darren Hart LKML-Reference: <1284468228-8723-1-git-send-email-namhyung@gmail.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..464de2751ff9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2458,7 +2458,7 @@ retry: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, - int *pi) + unsigned int *pi) { unsigned long uentry; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e5..06da4dfc339b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -19,7 +19,7 @@ */ static inline int fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, int *pi) + compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; -- cgit v1.2.2 From a3c74c52570c0c4ac90c9a0216de800c39089ba7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 14 Sep 2010 21:43:47 +0900 Subject: futex: Mark restart_block.futex.uaddr[2] __user @uaddr and @uaddr2 fields in restart_block.futex are user pointers. Add __user and remove unnecessary casts. 
Signed-off-by: Namhyung Kim Cc: Peter Zijlstra Cc: Darren Hart LKML-Reference: <1284468228-8723-2-git-send-email-namhyung@gmail.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 464de2751ff9..45e448a5e440 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1843,7 +1843,7 @@ retry: restart = &current_thread_info()->restart_block; restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; + restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; @@ -1869,7 +1869,7 @@ out: static long futex_wait_restart(struct restart_block *restart) { - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + u32 __user *uaddr = restart->futex.uaddr; int fshared = 0; ktime_t t, *tp = NULL; -- cgit v1.2.2 From 15e408cd6ccc3f4f453d87ccd5bc7a84d59feb96 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 14 Sep 2010 21:43:48 +0900 Subject: futex: Add lock context annotations queue_lock/unlock/me() and unqueue_me_pi() grab/release spinlocks but are missing proper annotations. Add them. Signed-off-by: Namhyung Kim Cc: Peter Zijlstra Cc: Darren Hart LKML-Reference: <1284468228-8723-3-git-send-email-namhyung@gmail.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 45e448a5e440..92a31d4cd564 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1360,6 +1360,7 @@ out: /* The key must be already stored in q->key.
*/ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) + __acquires(&hb->lock) { struct futex_hash_bucket *hb; @@ -1373,6 +1374,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { spin_unlock(&hb->lock); drop_futex_key_refs(&q->key); @@ -1391,6 +1393,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * an example). */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { int prio; @@ -1471,6 +1474,7 @@ retry: * and dropped here. */ static void unqueue_me_pi(struct futex_q *q) + __releases(q->lock_ptr) { WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); -- cgit v1.2.2 From 58b26c4c025778c09c7a1438ff185080e11b7d0a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Sep 2010 18:19:17 -0700 Subject: sched: Increment cache_nice_tries only on periodic lb scheduler uses cache_nice_tries as an indicator to do cache_hot and active load balance, when normal load balance fails. Currently, this value is changed on any failed load balance attempt. That ends up being not so nice to workloads that enter/exit idle often, as they do more frequent new_idle balance and that pretty soon results in cache hot tasks being pulled in. Making the cache_nice_tries ignore failed new_idle balance seems to make better sense. With that only the failed load balance in periodic load balance gets accounted and the rate of accumulation of cache_nice_tries will not depend on idle entry/exit (short running sleep-wakeup kind of tasks). This reduces movement of cache_hot tasks. schedstat diff (after-before) excerpt from a workload that has frequent and short wakeup-idle pattern (:2 in cpu col below refers to NEWIDLE idx) This snapshot was across ~400 seconds. 
Without this change: domainstats: domain0 cpu cnt bln fld imb gain hgain nobusyq nobusyg 0:2 306487 219575 73167 110069413 44583 19070 1172 218403 1:2 292139 194853 81421 120893383 50745 21902 1259 193594 2:2 283166 174607 91359 129699642 54931 23688 1287 173320 3:2 273998 161788 93991 132757146 57122 24351 1366 160422 4:2 289851 215692 62190 83398383 36377 13680 851 214841 5:2 316312 222146 77605 117582154 49948 20281 988 221158 6:2 297172 195596 83623 122133390 52801 21301 929 194667 7:2 283391 178078 86378 126622761 55122 22239 928 177150 8:2 297655 210359 72995 110246694 45798 19777 1125 209234 9:2 297357 202011 79363 119753474 50953 22088 1089 200922 10:2 278797 178703 83180 122514385 52969 22726 1128 177575 11:2 272661 167669 86978 127342327 55857 24342 1195 166474 12:2 293039 204031 73211 110282059 47285 19651 948 203083 13:2 289502 196762 76803 114712942 49339 20547 1016 195746 14:2 264446 169609 78292 115715605 50459 21017 982 168627 15:2 260968 163660 80142 116811793 51483 21281 1064 162596 With this change: domainstats: domain0 cpu cnt bln fld imb gain hgain nobusyq nobusyg 0:2 272347 187380 77455 105420270 24975 1 953 186427 1:2 267276 172360 86234 116242264 28087 6 1028 171332 2:2 259769 156777 93281 123243134 30555 1 1043 155734 3:2 250870 143129 97627 127370868 32026 6 1188 141941 4:2 248422 177116 64096 78261112 22202 2 757 176359 5:2 275595 180683 84950 116075022 29400 6 778 179905 6:2 262418 162609 88944 119256898 31056 4 817 161792 7:2 252204 147946 92646 122388300 32879 4 824 147122 8:2 262335 172239 81631 110477214 26599 4 864 171375 9:2 261563 164775 88016 117203621 28331 3 849 163926 10:2 243389 140949 93379 121353071 29585 2 909 140040 11:2 242795 134651 98310 124768957 30895 2 1016 133635 12:2 255234 166622 79843 104696912 26483 4 746 165876 13:2 244944 151595 83855 109808099 27787 3 801 150794 14:2 241301 140982 89935 116954383 30403 6 845 140137 15:2 232271 128564 92821 119185207 31207 4 1416 127148 Signed-off-by: Venkatesh Pallipadi 
Signed-off-by: Peter Zijlstra LKML-Reference: <1284167957-3675-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a171138a9402..aa16cf1eb8fe 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3031,7 +3031,14 @@ redo: if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), this_cpu)) { -- cgit v1.2.2 From 43fa5460fe60dea5c610490a1d263415419c60f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Sep 2010 22:40:03 -0400 Subject: sched: Try not to migrate higher priority RT tasks When first working on the RT scheduler design, we concentrated on keeping all CPUs running RT tasks instead of having multiple RT tasks on a single CPU waiting for the migration thread to move them. Instead we take a more proactive stance and push or pull RT tasks from one CPU to another on wakeup or scheduling. When an RT task wakes up on a CPU that is running another RT task, instead of preempting it and killing the cache of the running RT task, we look to see if we can migrate the RT task that is waking up, even if the RT task waking up is of higher priority. This may sound a bit odd, but RT tasks should be limited in migration by the user anyway. But in practice, people do not do this, which causes high prio RT tasks to bounce around the CPUs. This becomes even worse when we have priority inheritance, because a high prio task can block on a lower prio task and boost its priority. 
When the lower prio task wakes up the high prio task, if it happens to be on the same CPU it will migrate off of it. But in reality, the above does not happen much either, because the wake up of the lower prio task, which has already been boosted, if it was on the same CPU as the higher prio task, it would then migrate off of it. But anyway, we do not want to migrate them either. To examine the scheduling, I created a test program and examined it under kernelshark. The test program created CPU * 2 threads, where each thread had a different priority. The program takes different options. The option used in this change log was whether to have priority inheritance mutexes or not. All threads did the following loop: static void grab_lock(long id, int iter, int l) { ftrace_write("thread %ld iter %d, taking lock %d\n", id, iter, l); pthread_mutex_lock(&locks[l]); ftrace_write("thread %ld iter %d, took lock %d\n", id, iter, l); busy_loop(nr_tasks - id); ftrace_write("thread %ld iter %d, unlock lock %d\n", id, iter, l); pthread_mutex_unlock(&locks[l]); } void *start_task(void *id) { [...] while (!done) { for (l = 0; l < nr_locks; l++) { grab_lock(id, i, l); ftrace_write("thread %ld iter %d sleeping\n", id, i); ms_sleep(id); } i++; } [...] } The busy_loop(ms) keeps the CPU spinning for ms milliseconds. The ms_sleep(ms) sleeps for ms milliseconds. The ftrace_write() writes to the ftrace buffer to help analyze via ftrace. The higher the id, the higher the prio, the shorter it does the busy loop, but the longer it sleeps. This is usually the case with RT tasks, the lower priority tasks usually run longer than higher priority tasks. At the end of the test, it records the number of loops each thread took, as well as the number of voluntary preemptions, non-voluntary preemptions, and number of migrations each thread took, taking the information from /proc/$$/sched and /proc/$$/status.
Running this on a 4 CPU processor, the results without changes to the kernel looked like this: Task vol nonvol migrated iterations ---- --- ------ -------- ---------- 0: 53 3220 1470 98 1: 562 773 724 98 2: 752 933 1375 98 3: 749 39 697 98 4: 758 5 515 98 5: 764 2 679 99 6: 761 2 535 99 7: 757 3 346 99 total: 5156 4977 6341 787 Each thread regardless of priority migrated a few hundred times. The higher priority tasks were a little better but still took quite an impact. By letting higher priority tasks bump the lower prio task from the CPU, things changed a bit: Task vol nonvol migrated iterations ---- --- ------ -------- ---------- 0: 37 2835 1937 98 1: 666 1821 1865 98 2: 654 1003 1385 98 3: 664 635 973 99 4: 698 197 352 99 5: 703 101 159 99 6: 708 1 75 99 7: 713 1 2 99 total: 4843 6594 6748 789 The total # of migrations did not change (several runs showed the difference all within the noise). But we now see a dramatic improvement to the higher priority tasks. (kernelshark showed that the watchdog timer bumped the highest priority task to give it the 2 count. This was actually consistent with every run). Notice that the # of iterations did not change either. The above was with priority inheritance mutexes. That is, when the higher priority task blocked on a lower priority task, the lower priority task would inherit the priority of the higher priority task (which shows why task 6 was bumped so many times). When not using priority inheritance mutexes, the current kernel shows this: Task vol nonvol migrated iterations ---- --- ------ -------- ---------- 0: 56 3101 1892 95 1: 594 713 937 95 2: 625 188 618 95 3: 628 4 491 96 4: 640 7 468 96 5: 631 2 501 96 6: 641 1 466 96 7: 643 2 497 96 total: 4458 4018 5870 765 Not much changed with or without priority inheritance mutexes.
But if we let the high priority task bump lower priority tasks on wakeup we see: Task vol nonvol migrated iterations ---- --- ------ -------- ---------- 0: 115 3439 2782 98 1: 633 1354 1583 99 2: 652 919 1218 99 3: 645 713 934 99 4: 690 3 3 99 5: 694 1 4 99 6: 720 3 4 99 7: 747 0 1 100 Which shows an even bigger change. The big difference between task 3 and task 4 is because we have only 4 CPUs on the machine, causing the 4 highest prio tasks to always have preference. Although I did not measure cache misses, and I'm sure there would be little to measure since the test was not data intensive, I could imagine large improvements for higher priority tasks when dealing with lower priority tasks. Thus, I'm satisfied with making the change and agreeing with what Gregory Haskins argued a few years ago when we first had this discussion. One final note. All tasks in the above tests were RT tasks. Any RT task will always preempt a non RT task that is running on the CPU the RT task wants to run on. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: Gregory Haskins LKML-Reference: <20100921024138.605460343@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d10c80ebb67a..6a02b38ab653 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -960,18 +960,18 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * runqueue. Otherwise simply start this RT task * on its current runqueue. * - * We want to avoid overloading runqueues. Even if - * the RT task is of higher priority than the current RT task. - * RT tasks behave differently than other tasks. If - * one gets preempted, we try to push it off to another queue. - * So trying to keep a preempting RT task on the same - * cache hot CPU will force the running RT task to - * a cold CPU.
So we waste all the cache for the lower - * RT task in hopes of saving some of a RT task - * that is just being woken and probably will have - * cold cache anyway. + * We want to avoid overloading runqueues. If the woken + * task is a higher priority, then it will stay on this CPU + * and the lower prio task should be moved to another CPU. + * Even though this will probably make the lower prio task + * lose its cache, we do not want to bounce a higher task + * around just because it gave up its CPU, perhaps for a + * lock? + * + * For equal prio tasks, we just let the scheduler sort it out. */ if (unlikely(rt_task(rq->curr)) && + rq->curr->prio < p->prio && (p->rt.nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); @@ -1491,6 +1491,8 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && + rt_task(rq->curr) && + rq->curr->prio < p->prio && p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } -- cgit v1.2.2 From b3bc211cfe7d5fe94b310480d78e00bea96fbf2a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Sep 2010 22:40:04 -0400 Subject: sched: Give CPU bound RT tasks preference If a high priority task is waking up on a CPU that is running a lower priority task that is bound to a CPU, see if we can move the high RT task to another CPU first. Note, if all other CPUs are running higher priority tasks than the CPU bounded current task, then it will be preempted regardless. 
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: Gregory Haskins LKML-Reference: <20100921024138.888922071@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6a02b38ab653..baef30f08405 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -971,7 +971,8 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * For equal prio tasks, we just let the scheduler sort it out. */ if (unlikely(rt_task(rq->curr)) && - rq->curr->prio < p->prio && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); @@ -1491,9 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && + p->rt.nr_cpus_allowed > 1 && rt_task(rq->curr) && - rq->curr->prio < p->prio && - p->rt.nr_cpus_allowed > 1) + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio)) push_rt_tasks(rq); } -- cgit v1.2.2 From a8027073eb127cd207070891374b5c54c2ce3d23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Sep 2010 15:13:34 -0400 Subject: tracing/sched: Add sched_pi_setprio tracepoint Add a tracepoint that shows the priority of a task being boosted via priority inheritance. 
Cc: Gregory Haskins Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9ca8ad05950b..4ad473814350 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4355,6 +4355,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; -- cgit v1.2.2 From 829f8ed2c963df7c23d1c644db6c4387eb1601fa Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Mon, 6 Sep 2010 16:37:12 +0200 Subject: kernel: Remove undead ifdef CONFIG_DEBUG_LOCK_ALLOC The CONFIG_DEBUG_LOCK_ALLOC ifdef isn't necessary at this point, because it is checked in an outer ifdef level already and has no effect here. Signed-off-by: Christian Dietrich Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) int __init_srcu_struct(struct srcu_struct *sp, const char *name, struct lock_class_key *key) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); -- cgit v1.2.2 From 0ddea0ead2ced99eaaaddff2beb755381e5c89f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Sep 2010 21:06:14 -0700 Subject: rcu: fix sparse errors in rcutorture.c Add the sparse __rcu address-space identifier and make a couple of variables static. Signed-off-by: Paul E. 
McKenney --- kernel/rcutorture.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 729710273dcb..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,7 +120,7 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current; +static struct rcu_torture __rcu *rcu_torture_current; static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ static int fullstop = FULLSTOP_RMMOD; -DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ - /* of kthreads. */ +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); /* * Detect and respond to a system shutdown. @@ -737,7 +739,8 @@ rcu_torture_writer(void *arg) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ -- cgit v1.2.2 From 269dcc1c2ec25864308ee03a3fa26ea819d9f5d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Sep 2010 14:23:09 -0700 Subject: rcu: Add tracing data to support queueing models The current tracing data is not sufficient to deduce the average time that a callback spends waiting for a grace period to end. Add three per-CPU counters recording the number of callbacks invoked (ci), the number of callbacks orphaned (co), and the number of callbacks adopted (ca). 
Given the existing callback queue length (ql), the average wait time in absence of CPU hotplug operations is ql/ci. The units of wait time will be in terms of the duration over which ci was measured. In the presence of CPU hotplug operations, there is room for argument, but ql/(ci-co+ca) won't steer you too far wrong. Also fixes a typo called out by Lucas De Marchi . Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 3 +++ kernel/rcutree.h | 3 +++ kernel/rcutree_trace.c | 10 +++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 42140a860bb9..e75073504a31 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1004,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } @@ -1025,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; rdp->qlen += rsp->orphan_qlen; + rdp->n_cbs_adopted += rsp->orphan_qlen; rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; @@ -1156,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; + rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7918ba61873f..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,9 @@ struct rcu_data { long qlen; /* # of queued callbacks */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. 
*/ + unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 458e032a3a30..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } #define PRINT_RCU_DATA(name, func, m) \ @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata_csv(struct seq_file *m, void *unused) @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -- cgit v1.2.2 From d1ea13c6e2cce0106531852daaa93dd97aec9580 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2010 18:40:07 +0200 Subject: genirq: Cleanup 
irq_chip->typename leftovers 3 years transition phase is enough. Cleanup the last users and remove the cruft. Signed-off-by: Thomas Gleixner Cc: Leo Chen Cc: Hirokazu Takata Cc: Chris Metcalf Cc: Jeff Dike Cc: Chris Zankel --- kernel/irq/chip.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5ca2f8..4ea775cc60f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -344,8 +344,6 @@ void irq_chip_set_defaults(struct irq_chip *chip) if (!chip->shutdown) chip->shutdown = chip->disable != default_disable ? chip->disable : default_shutdown; - if (!chip->name) - chip->name = chip->typename; if (!chip->end) chip->end = dummy_irq_chip.end; } -- cgit v1.2.2 From d9817ebeeef16e01487549312c68540ca8f1561b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:59 +0000 Subject: genirq: Provide Kconfig The generic irq Kconfig options are copied around all archs. Provide a generic Kconfig file which can be included. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121843.217333624@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/Kconfig | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 kernel/irq/Kconfig (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 000000000000..e0fc6cd78aa0 --- /dev/null +++ b/kernel/irq/Kconfig @@ -0,0 +1,54 @@ +config HAVE_GENERIC_HARDIRQS + def_bool n + +if HAVE_GENERIC_HARDIRQS +menu "IRQ subsystem" +# +# Interrupt subsystem related configuration options +# +config GENERIC_HARDIRQS + def_bool y + +config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + +# Options selectable by the architecture code +config HAVE_SPARSE_IRQ + def_bool n + +config GENERIC_IRQ_PROBE + def_bool n + +config GENERIC_PENDING_IRQ + def_bool n + +if SPARSE_IRQ && NUMA +config NUMA_IRQ_DESC + def_bool n +endif + +config AUTO_IRQ_AFFINITY + def_bool n + +config IRQ_PER_CPU + def_bool n + +config HARDIRQS_SW_RESEND + def_bool n + +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on HAVE_SPARSE_IRQ + ---help--- + + Sparse irq numbering is useful for distro kernels that want + to define a high CONFIG_NR_CPUS value but still want to have + low kernel memory footprint on smaller machines. + + ( Sparse irqs can also be beneficial on NUMA boxes, as they spread + out the interrupt descriptors in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. + +endmenu +endif -- cgit v1.2.2 From ff7dcd44dd446db2c3e13bdedf2d52b8e0127f16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:25 +0000 Subject: genirq: Create irq_data Low level chip functions need access to irq_desc->handler_data, irq_desc->chip_data and irq_desc->msi_desc. We hand down the irq number to the low level functions, so they need to lookup irq_desc. With sparse irq this means a radix tree lookup. 
We could hand down irq_desc itself, but low level chip functions have no need to fiddle with it directly and we want to restrict access to irq_desc further. Preparatory patch for new chip functions. Note, that the ugly anon union/struct is there to avoid a full tree wide clean up for now. This is not going to last 3 years like __do_IRQ() Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121841.645542300@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/handle.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c6911223..099d4fc368c3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -75,12 +75,10 @@ EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ static struct irq_desc irq_desc_init = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .status = IRQ_DISABLED, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) @@ -105,7 +103,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); raw_spin_lock_init(&desc->lock); - desc->irq = irq; + desc->irq_data.irq = irq; #ifdef CONFIG_SMP desc->node = node; #endif @@ -151,12 +149,10 @@ void replace_irq_desc(unsigned int irq, struct irq_desc *desc) static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS_LEGACY-1] = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .status = IRQ_DISABLED, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), } }; @@ -183,8 +179,11 @@ int __init early_irq_init(void) kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * sizeof(int), GFP_NOWAIT, node); + irq_desc_init.irq_data.chip = &no_irq_chip; + for (i = 0; i < legacy_count; i++) { - desc[i].irq = i; + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; #ifdef CONFIG_SMP desc[i].node = node; #endif @@ -241,11 +240,10 @@ out_unlock: struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + .status = IRQ_DISABLED, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; @@ -264,7 +262,8 @@ int __init early_irq_init(void) count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { - desc[i].irq = i; + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; alloc_desc_masks(&desc[i], 0, true); init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; -- cgit v1.2.2 From 6b8ff3120c758340505dddf08ad685ebb841d5d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 12:58:38 +0200 Subject: genirq: Convert core code to irq_data Convert all references in the core code to irq, chip, handler_data, chip_data, msi_desc, affinity to irq_data.* Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/autoprobe.c | 14 ++++----- kernel/irq/chip.c | 78 +++++++++++++++++++++++------------------------ kernel/irq/handle.c | 16 +++++----- kernel/irq/internals.h | 12 ++++---- kernel/irq/manage.c | 54 ++++++++++++++++----------------
kernel/irq/migration.c | 10 +++--- kernel/irq/numa_migrate.c | 8 ++--- kernel/irq/proc.c | 8 ++--- kernel/irq/resend.c | 5 +-- kernel/irq/spurious.c | 6 ++-- 10 files changed, 106 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31ef110..f9bf9b228033 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -57,9 +57,9 @@ unsigned long probe_irq_on(void) * Some chips need to know about probing in * progress: */ - if (desc->chip->set_type) - desc->chip->set_type(i, IRQ_TYPE_PROBE); - desc->chip->startup(i); + if (desc->irq_data.chip->set_type) + desc->irq_data.chip->set_type(i, IRQ_TYPE_PROBE); + desc->irq_data.chip->startup(i); } raw_spin_unlock_irq(&desc->lock); } @@ -76,7 +76,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->chip->startup(i)) + if (desc->irq_data.chip->startup(i)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -98,7 +98,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->shutdown(i); } else if (i < 32) mask |= 1 << i; @@ -137,7 +137,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->shutdown(i); } raw_spin_unlock_irq(&desc->lock); } @@ -181,7 +181,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->shutdown(i); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4ea775cc60f0..e0e93ff10afd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -32,18 +32,18 @@ static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) /* Ensure we don't have left over values from a previous use of this irq */ raw_spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; - desc->chip = &no_irq_chip; + desc->irq_data.chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; desc->depth = 1; - desc->msi_desc = NULL; - desc->handler_data = NULL; + desc->irq_data.msi_desc = NULL; + desc->irq_data.handler_data = NULL; if (!keep_chip_data) - desc->chip_data = NULL; + desc->irq_data.chip_data = NULL; desc->action = NULL; desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpumask_setall(desc->affinity); + cpumask_setall(desc->irq_data.affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -64,7 +64,7 @@ void dynamic_irq_init(unsigned int irq) * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq * @irq: irq number to initialize * - * does not set irq_to_desc(irq)->chip_data to NULL + * does not set irq_to_desc(irq)->irq_data.chip_data to NULL */ void dynamic_irq_init_keep_chip_data(unsigned int irq) { @@ -88,12 +88,12 @@ static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) irq); 
return; } - desc->msi_desc = NULL; - desc->handler_data = NULL; + desc->irq_data.msi_desc = NULL; + desc->irq_data.handler_data = NULL; if (!keep_chip_data) - desc->chip_data = NULL; + desc->irq_data.chip_data = NULL; desc->handle_irq = handle_bad_irq; - desc->chip = &no_irq_chip; + desc->irq_data.chip = &no_irq_chip; desc->name = NULL; clear_kstat_irqs(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -112,7 +112,7 @@ void dynamic_irq_cleanup(unsigned int irq) * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq * @irq: irq number to initialize * - * does not set irq_to_desc(irq)->chip_data to NULL + * does not set irq_to_desc(irq)->irq_data.chip_data to NULL */ void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) { @@ -140,7 +140,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); - desc->chip = chip; + desc->irq_data.chip = chip; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -193,7 +193,7 @@ int set_irq_data(unsigned int irq, void *data) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->handler_data = data; + desc->irq_data.handler_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -218,7 +218,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->msi_desc = entry; + desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -243,13 +243,13 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - if (!desc->chip) { + if (!desc->irq_data.chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } raw_spin_lock_irqsave(&desc->lock, flags); - desc->chip_data = data; + desc->irq_data.chip_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -291,7 +291,7 @@ static void default_enable(unsigned int irq) { struct irq_desc *desc = 
irq_to_desc(irq); - desc->chip->unmask(irq); + desc->irq_data.chip->unmask(irq); desc->status &= ~IRQ_MASKED; } @@ -309,7 +309,7 @@ static unsigned int default_startup(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - desc->chip->enable(irq); + desc->irq_data.chip->enable(irq); return 0; } @@ -320,7 +320,7 @@ static void default_shutdown(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - desc->chip->mask(irq); + desc->irq_data.chip->mask(irq); desc->status |= IRQ_MASKED; } @@ -350,28 +350,28 @@ void irq_chip_set_defaults(struct irq_chip *chip) static inline void mask_ack_irq(struct irq_desc *desc, int irq) { - if (desc->chip->mask_ack) - desc->chip->mask_ack(irq); + if (desc->irq_data.chip->mask_ack) + desc->irq_data.chip->mask_ack(irq); else { - desc->chip->mask(irq); - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->mask(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); } desc->status |= IRQ_MASKED; } static inline void mask_irq(struct irq_desc *desc, int irq) { - if (desc->chip->mask) { - desc->chip->mask(irq); + if (desc->irq_data.chip->mask) { + desc->irq_data.chip->mask(irq); desc->status |= IRQ_MASKED; } } static inline void unmask_irq(struct irq_desc *desc, int irq) { - if (desc->chip->unmask) { - desc->chip->unmask(irq); + if (desc->irq_data.chip->unmask) { + desc->irq_data.chip->unmask(irq); desc->status &= ~IRQ_MASKED; } } @@ -552,7 +552,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: - desc->chip->eoi(irq); + desc->irq_data.chip->eoi(irq); raw_spin_unlock(&desc->lock); } @@ -594,8 +594,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -648,15 
+648,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) - desc->chip->eoi(irq); + if (desc->irq_data.chip->eoi) + desc->irq_data.chip->eoi(irq); } void @@ -674,7 +674,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) handle = handle_bad_irq; - else if (desc->chip == &no_irq_chip) { + else if (desc->irq_data.chip == &no_irq_chip) { printk(KERN_WARNING "Trying to install %sinterrupt handler " "for IRQ%d\n", is_chained ? "chained " : "", irq); /* @@ -684,7 +684,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, * prevent us to setup the interrupt at all. Switch it to * dummy_irq_chip for easy transition. */ - desc->chip = &dummy_irq_chip; + desc->irq_data.chip = &dummy_irq_chip; } chip_bus_lock(irq, desc); @@ -692,7 +692,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? 
*/ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) + if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc, irq); desc->status |= IRQ_DISABLED; desc->depth = 1; @@ -704,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->startup(irq); + desc->irq_data.chip->startup(irq); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 099d4fc368c3..fc27d76e83ef 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -105,7 +105,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) raw_spin_lock_init(&desc->lock); desc->irq_data.irq = irq; #ifdef CONFIG_SMP - desc->node = node; + desc->irq_data.node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_kstat_irqs(desc, node, nr_cpu_ids); @@ -185,7 +185,7 @@ int __init early_irq_init(void) desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; #ifdef CONFIG_SMP - desc[i].node = node; + desc[i].irq_data.node = node; #endif desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -456,20 +456,20 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); } - desc->chip->end(irq); + desc->irq_data.chip->end(irq); return 1; } raw_spin_lock(&desc->lock); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs 
that are being tested @@ -529,7 +529,7 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. */ - desc->chip->end(irq); + desc->irq_data.chip->end(irq); raw_spin_unlock(&desc->lock); return 1; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc88f0b..a805a00cfd28 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,14 +43,14 @@ extern void irq_set_thread_affinity(struct irq_desc *desc); /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) { - if (unlikely(desc->chip->bus_lock)) - desc->chip->bus_lock(irq); + if (unlikely(desc->irq_data.chip->bus_lock)) + desc->irq_data.chip->bus_lock(irq); } static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) { - if (unlikely(desc->chip->bus_sync_unlock)) - desc->chip->bus_sync_unlock(irq); + if (unlikely(desc->irq_data.chip->bus_sync_unlock)) + desc->irq_data.chip->bus_sync_unlock(irq); } /* @@ -67,8 +67,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->chip(): %p, ", desc->chip); - print_symbol("%s\n", (unsigned long)desc->chip); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { printk("->action->handler(): %p, ", desc->action->handler); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c3003e9d91a3..4dfb19521d9f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || - !desc->chip->set_affinity) + if 
(CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || + !desc->irq_data.chip->set_affinity) return 0; return 1; @@ -111,15 +111,15 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (!desc->chip->set_affinity) + if (!desc->irq_data.chip->set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!desc->irq_data.chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } } @@ -128,8 +128,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->pending_mask, cpumask); } #else - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!desc->irq_data.chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } #endif @@ -168,16 +168,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. 
*/ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, desc->affinity); + desc->irq_data.chip->set_affinity(irq, desc->irq_data.affinity); return 0; } @@ -223,7 +223,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); + desc->irq_data.chip->disable(irq); } } @@ -313,7 +313,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * IRQ line is re-enabled. * * This function may be called from IRQ context only when - * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 
*/ void enable_irq(unsigned int irq) { @@ -336,8 +336,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; - if (desc->chip->set_wake) - ret = desc->chip->set_wake(irq, on); + if (desc->irq_data.chip->set_wake) + ret = desc->irq_data.chip->set_wake(irq, on); return ret; } @@ -432,7 +432,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { int ret; - struct irq_chip *chip = desc->chip; + struct irq_chip *chip = desc->irq_data.chip; if (!chip || !chip->set_type) { /* @@ -457,8 +457,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; - if (chip != desc->chip) - irq_chip_set_defaults(desc->chip); + if (chip != desc->irq_data.chip) + irq_chip_set_defaults(desc->irq_data.chip); } return ret; @@ -528,7 +528,7 @@ again: if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; - desc->chip->unmask(irq); + desc->irq_data.chip->unmask(irq); } raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(irq, desc); @@ -556,7 +556,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->affinity); + cpumask_copy(mask, desc->irq_data.affinity); raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); @@ -657,7 +657,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!desc) return -EINVAL; - if (desc->chip == &no_irq_chip) + if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, @@ -752,7 +752,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - irq_chip_set_defaults(desc->chip); + irq_chip_set_defaults(desc->irq_data.chip); init_waitqueue_head(&desc->wait_for_threads); @@ -779,7 +779,7 @@ __setup_irq(unsigned 
int irq, struct irq_desc *desc, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - desc->chip->startup(irq); + desc->irq_data.chip->startup(irq); } else /* Undo nested disables: */ desc->depth = 1; @@ -912,17 +912,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); + if (desc->irq_data.chip->release) + desc->irq_data.chip->release(irq, dev_id); #endif /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); + if (desc->irq_data.chip->shutdown) + desc->irq_data.chip->shutdown(irq); else - desc->chip->disable(irq); + desc->irq_data.chip->disable(irq); } #ifdef CONFIG_SMP diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 241962280836..f923c37e651a 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -24,7 +24,7 @@ void move_masked_irq(int irq) if (unlikely(cpumask_empty(desc->pending_mask))) return; - if (!desc->chip->set_affinity) + if (!desc->irq_data.chip->set_affinity) return; assert_raw_spin_locked(&desc->lock); @@ -43,8 +43,8 @@ void move_masked_irq(int irq) */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) - if (!desc->chip->set_affinity(irq, desc->pending_mask)) { - cpumask_copy(desc->affinity, desc->pending_mask); + if (!desc->irq_data.chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->irq_data.affinity, desc->pending_mask); irq_set_thread_affinity(desc); } @@ -61,8 +61,8 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + desc->irq_data.chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + desc->irq_data.chip->unmask(irq); } diff --git a/kernel/irq/numa_migrate.c 
b/kernel/irq/numa_migrate.c index 65d3845665ac..e7f1f16402c1 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -44,7 +44,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, return false; } raw_spin_lock_init(&desc->lock); - desc->node = node; + desc->irq_data.node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); @@ -66,7 +66,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, unsigned int irq; unsigned long flags; - irq = old_desc->irq; + irq = old_desc->irq_data.irq; raw_spin_lock_irqsave(&sparse_irq_lock, flags); @@ -109,10 +109,10 @@ out_unlock: struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { /* those static or target node is -1, do not move them */ - if (desc->irq < NR_IRQS_LEGACY || node == -1) + if (desc->irq_data.irq < NR_IRQS_LEGACY || node == -1) return desc; - if (desc->node != node) + if (desc->irq_data.node != node) desc = __real_move_irq_desc(desc, node); return desc; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd2..9b0da94b5b2b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->affinity; + const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->irq_data.chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = 
irq_to_desc((long) m->private); - seq_printf(m, "%d\n", desc->node); + seq_printf(m, "%d\n", desc->irq_data.node); return 0; } @@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c3763f3a2..47c56a097928 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * Make sure the interrupt is enabled, before resending it: */ - desc->chip->enable(irq); + desc->irq_data.chip->enable(irq); /* * We do not resend level type interrupts. Level type @@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { + if (!desc->irq_data.chip->retrigger || + !desc->irq_data.chip->retrigger(irq)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90ae534f..36c2c9289e2b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -78,8 +78,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(irq); + if (work && desc->irq_data.chip && desc->irq_data.chip->end) + desc->irq_data.chip->end(irq); raw_spin_unlock(&desc->lock); return ok; @@ -254,7 +254,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 
desc->depth++; - desc->chip->disable(irq); + desc->irq_data.chip->disable(irq); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); -- cgit v1.2.2 From a77c4635915021c646cc017f22239e66d1aab4d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 14:44:58 +0200 Subject: genirq: Add new functions to dummy chips The compat functions go away when the core code is converted. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/handle.c | 59 +++++++++++++++++++++++++++++++++++--------------- kernel/irq/internals.h | 2 ++ 2 files changed, 44 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fc27d76e83ef..adca5b4b40d8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -291,7 +291,15 @@ void clear_kstat_irqs(struct irq_desc *desc) * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. */ -static void ack_bad(unsigned int irq) +static void ack_bad(struct irq_data *data) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + print_irq_desc(data->irq, desc); + ack_bad_irq(data->irq); +} + +static void compat_ack_bad(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -302,11 +310,16 @@ static void ack_bad(unsigned int irq) /* * NOP functions */ -static void noop(unsigned int irq) +static void noop(struct irq_data *data) { } + +static unsigned int noop_ret(struct irq_data *data) { + return 0; } -static unsigned int noop_ret(unsigned int irq) +static void compat_noop(unsigned int irq) { } + +static unsigned int compat_noop_ret(unsigned int irq) { return 0; } @@ -316,12 +329,17 @@ static unsigned int noop_ret(unsigned int irq) */ struct irq_chip no_irq_chip = { .name = "none", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = ack_bad, - .end = noop, + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = 
noop, + .irq_ack = ack_bad, + .startup = compat_noop_ret, + .shutdown = compat_noop, + .enable = compat_noop, + .disable = compat_noop, + .ack = compat_ack_bad, + .end = compat_noop, }; /* @@ -330,14 +348,21 @@ struct irq_chip no_irq_chip = { */ struct irq_chip dummy_irq_chip = { .name = "dummy", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = noop, - .mask = noop, - .unmask = noop, - .end = noop, + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + .startup = compat_noop_ret, + .shutdown = compat_noop, + .enable = compat_noop, + .disable = compat_noop, + .ack = compat_noop, + .mask = compat_noop, + .unmask = compat_noop, + .end = compat_noop, }; /* diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a805a00cfd28..562fc7eeabec 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -4,6 +4,8 @@ extern int noirqdebug; +#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) + /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); -- cgit v1.2.2 From 3876ec9ef3775d062345b3760d3271ecb8cd3fea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:35 +0000 Subject: genirq: Provide compat handling for bus_lock/bus_sync_unlock Wrap the old chip functions for bus_lock/bus_sync_unlock until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121841.842536121@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 20 ++++++++++++++++++-- kernel/irq/internals.h | 12 ++++++------ kernel/irq/manage.c | 22 +++++++++++----------- 3 files changed, 35 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e0e93ff10afd..77e551d92239 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -324,6 +324,17 @@ static void default_shutdown(unsigned int irq) desc->status |= IRQ_MASKED; } +/* Temporary migration helpers */ +static void compat_bus_lock(struct irq_data *data) +{ + data->chip->bus_lock(data->irq); +} + +static void compat_bus_sync_unlock(struct irq_data *data) +{ + data->chip->bus_sync_unlock(data->irq); +} + /* * Fixup enable/disable function pointers */ @@ -346,6 +357,11 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->disable : default_shutdown; if (!chip->end) chip->end = dummy_irq_chip.end; + + if (chip->bus_lock) + chip->irq_bus_lock = compat_bus_lock; + if (chip->bus_sync_unlock) + chip->irq_bus_sync_unlock = compat_bus_sync_unlock; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) @@ -687,7 +703,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->irq_data.chip = &dummy_irq_chip; } - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? 
*/ @@ -707,7 +723,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->irq_data.chip->startup(irq); } raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 562fc7eeabec..ecafbfee5b12 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,16 +43,16 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); /* Inline functions for support of irq chips on slow busses */ -static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_lock(struct irq_desc *desc) { - if (unlikely(desc->irq_data.chip->bus_lock)) - desc->irq_data.chip->bus_lock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_lock)) + desc->irq_data.chip->irq_bus_lock(&desc->irq_data); } -static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_sync_unlock(struct irq_desc *desc) { - if (unlikely(desc->irq_data.chip->bus_sync_unlock)) - desc->irq_data.chip->bus_sync_unlock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) + desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4dfb19521d9f..dfb02ff7d2ef 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -246,11 +246,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -323,11 +323,11 @@ void enable_irq(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); 
raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(enable_irq); @@ -507,7 +507,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { again: - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* @@ -521,7 +521,7 @@ again: */ if (unlikely(desc->status & IRQ_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); cpu_relax(); goto again; } @@ -531,7 +531,7 @@ again: desc->irq_data.chip->unmask(irq); } raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP @@ -997,9 +997,9 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); @@ -1086,9 +1086,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); if (retval) kfree(action); -- cgit v1.2.2 From e2c0f8ff0fc26959952fbfa89f732fef928df77f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:42 +0000 Subject: genirq: Provide compat handling for chip->mask() Wrap the old chip function mask() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121841.940355859@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 22 +++++++++++++++------- kernel/irq/handle.c | 1 - kernel/irq/migration.c | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 77e551d92239..c041270bfe50 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -320,11 +320,16 @@ static void default_shutdown(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - desc->irq_data.chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } /* Temporary migration helpers */ +static void compat_irq_mask(struct irq_data *data) +{ + data->chip->mask(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -362,6 +367,9 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) chip->irq_bus_sync_unlock = compat_bus_sync_unlock; + + if (chip->mask) + chip->irq_mask = compat_irq_mask; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) @@ -369,17 +377,17 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->irq_data.chip->mask_ack) desc->irq_data.chip->mask_ack(irq); else { - desc->irq_data.chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); if (desc->irq_data.chip->ack) desc->irq_data.chip->ack(irq); } desc->status |= IRQ_MASKED; } -static inline void mask_irq(struct irq_desc *desc, int irq) +static inline void mask_irq(struct irq_desc *desc) { - if (desc->irq_data.chip->mask) { - desc->irq_data.chip->mask(irq); + if (desc->irq_data.chip->irq_mask) { + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } } @@ -553,7 +561,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - mask_irq(desc, irq); + mask_irq(desc); goto out; } @@ -621,7 
+629,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - mask_irq(desc, irq); + mask_irq(desc); goto out_unlock; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index adca5b4b40d8..3b160ac236b0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -360,7 +360,6 @@ struct irq_chip dummy_irq_chip = { .enable = compat_noop, .disable = compat_noop, .ack = compat_noop, - .mask = compat_noop, .unmask = compat_noop, .end = compat_noop, }; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index f923c37e651a..b165ec26b757 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -61,7 +61,7 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->irq_data.chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); desc->irq_data.chip->unmask(irq); } -- cgit v1.2.2 From 0eda58b7f3a30c9a13d83db1cfaab00e1c452055 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:44 +0000 Subject: genirq: Provide compat handling for chip->unmask() Wrap the old chip function unmask() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.043608928@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 20 ++++++++++++++------ kernel/irq/handle.c | 1 - kernel/irq/manage.c | 2 +- kernel/irq/migration.c | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c041270bfe50..dbdb59a42093 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -291,7 +291,7 @@ static void default_enable(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - desc->irq_data.chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } @@ -330,6 +330,11 @@ static void compat_irq_mask(struct irq_data *data) data->chip->mask(data->irq); } +static void compat_irq_unmask(struct irq_data *data) +{ + data->chip->unmask(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -370,6 +375,9 @@ void irq_chip_set_defaults(struct irq_chip *chip) if (chip->mask) chip->irq_mask = compat_irq_mask; + + if (chip->unmask) + chip->irq_unmask = compat_irq_unmask; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) @@ -392,10 +400,10 @@ static inline void mask_irq(struct irq_desc *desc) } } -static inline void unmask_irq(struct irq_desc *desc, int irq) +static inline void unmask_irq(struct irq_desc *desc) { - if (desc->irq_data.chip->unmask) { - desc->irq_data.chip->unmask(irq); + if (desc->irq_data.chip->irq_unmask) { + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } } @@ -524,7 +532,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) - unmask_irq(desc, irq); + unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } @@ -641,7 +649,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc, irq); + 
unmask_irq(desc); } desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3b160ac236b0..f334c8c59dd2 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -360,7 +360,6 @@ struct irq_chip dummy_irq_chip = { .enable = compat_noop, .disable = compat_noop, .ack = compat_noop, - .unmask = compat_noop, .end = compat_noop, }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dfb02ff7d2ef..b3986bce64ff 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -528,7 +528,7 @@ again: if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; - desc->irq_data.chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index b165ec26b757..7888e5d5575a 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -63,6 +63,6 @@ void move_native_irq(int irq) desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); - desc->irq_data.chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } -- cgit v1.2.2 From 22a49163e90dd7088f801dd54e25b04e1f337e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:47 +0000 Subject: genirq: Provide compat handling for chip->ack() Wrap the old chip function ack() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.142624725@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 20 ++++++++++++-------- kernel/irq/handle.c | 10 ---------- 2 files changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dbdb59a42093..864c3abdf8f4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -335,6 +335,11 @@ static void compat_irq_unmask(struct irq_data *data) data->chip->unmask(data->irq); } +static void compat_irq_ack(struct irq_data *data) +{ + data->chip->ack(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -372,12 +377,12 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) chip->irq_mask = compat_irq_mask; - if (chip->unmask) chip->irq_unmask = compat_irq_unmask; + if (chip->ack) + chip->irq_ack = compat_irq_ack; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) @@ -386,8 +391,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) desc->irq_data.chip->mask_ack(irq); else { desc->irq_data.chip->irq_mask(&desc->irq_data); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); } desc->status |= IRQ_MASKED; } @@ -626,8 +631,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); + desc->irq_data.chip->irq_ack(&desc->irq_data); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -680,8 +684,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); action_ret = 
handle_IRQ_event(irq, desc->action); if (!noirqdebug) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f334c8c59dd2..9ba7aece0e4d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -299,14 +299,6 @@ static void ack_bad(struct irq_data *data) ack_bad_irq(data->irq); } -static void compat_ack_bad(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - print_irq_desc(irq, desc); - ack_bad_irq(irq); -} - /* * NOP functions */ @@ -338,7 +330,6 @@ struct irq_chip no_irq_chip = { .shutdown = compat_noop, .enable = compat_noop, .disable = compat_noop, - .ack = compat_ack_bad, .end = compat_noop, }; @@ -359,7 +350,6 @@ struct irq_chip dummy_irq_chip = { .shutdown = compat_noop, .enable = compat_noop, .disable = compat_noop, - .ack = compat_noop, .end = compat_noop, }; -- cgit v1.2.2 From 9205e31d1af0f725e71bbae10d199c6b9e8d6dd8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:50 +0000 Subject: genirq: Provide compat handling for chip->mask_ack() Wrap the old chip function mask_ack() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.240806983@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 864c3abdf8f4..09c1a4493440 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -340,6 +340,11 @@ static void compat_irq_ack(struct irq_data *data) data->chip->ack(data->irq); } +static void compat_irq_mask_ack(struct irq_data *data) +{ + data->chip->mask_ack(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -383,12 +388,14 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_unmask = compat_irq_unmask; if (chip->ack) chip->irq_ack = compat_irq_ack; + if (chip->mask_ack) + chip->irq_mask_ack = compat_irq_mask_ack; } -static inline void mask_ack_irq(struct irq_desc *desc, int irq) +static inline void mask_ack_irq(struct irq_desc *desc) { - if (desc->irq_data.chip->mask_ack) - desc->irq_data.chip->mask_ack(irq); + if (desc->irq_data.chip->irq_mask_ack) + desc->irq_data.chip->irq_mask_ack(&desc->irq_data); else { desc->irq_data.chip->irq_mask(&desc->irq_data); if (desc->irq_data.chip->irq_ack) @@ -511,7 +518,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; raw_spin_lock(&desc->lock); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -625,7 +632,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -729,7 +736,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? 
*/ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) - mask_ack_irq(desc, irq); + mask_ack_irq(desc); desc->status |= IRQ_DISABLED; desc->depth = 1; } -- cgit v1.2.2 From 0c5c15572ac096001f52d37b416f2a4be9aebb80 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:53 +0000 Subject: genirq: Provide compat handling for chip->eoi() Wrap the old chip function eoi() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.339657617@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 09c1a4493440..c8648a83b80a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -345,6 +345,11 @@ static void compat_irq_mask_ack(struct irq_data *data) data->chip->mask_ack(data->irq); } +static void compat_irq_eoi(struct irq_data *data) +{ + data->chip->eoi(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -390,6 +395,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_ack = compat_irq_ack; if (chip->mask_ack) chip->irq_mask_ack = compat_irq_mask_ack; + if (chip->eoi) + chip->irq_eoi = compat_irq_eoi; } static inline void mask_ack_irq(struct irq_desc *desc) @@ -596,7 +603,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: - desc->irq_data.chip->eoi(irq); + desc->irq_data.chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } @@ -698,8 +705,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->irq_data.chip->eoi) - desc->irq_data.chip->eoi(irq); + if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(&desc->irq_data); } 
void -- cgit v1.2.2 From c5f756344c390f629243b4a28c2bd198fdfd7ee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:56 +0000 Subject: genirq: Provide compat handling for chip->enable() Wrap the old chip function enable() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.437159182@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 25 ++++++++++++++++++++----- kernel/irq/handle.c | 2 -- kernel/irq/resend.c | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c8648a83b80a..a95b47831269 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -287,9 +287,9 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); /* * default enable function */ -static void default_enable(unsigned int irq) +static void default_enable(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; @@ -309,7 +309,7 @@ static unsigned int default_startup(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - desc->irq_data.chip->enable(irq); + desc->irq_data.chip->irq_enable(&desc->irq_data); return 0; } @@ -350,6 +350,11 @@ static void compat_irq_eoi(struct irq_data *data) data->chip->eoi(data->irq); } +static void compat_irq_enable(struct irq_data *data) +{ + data->chip->enable(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -365,8 +370,18 @@ static void compat_bus_sync_unlock(struct irq_data *data) */ void irq_chip_set_defaults(struct irq_chip *chip) { - if (!chip->enable) - chip->enable = default_enable; + /* + * Compat fixup functions need to be before we set the + * defaults for enable/disable/startup/shutdown + */ + if (chip->enable) + 
chip->irq_enable = compat_irq_enable; + + /* + * The real defaults + */ + if (!chip->irq_enable) + chip->irq_enable = default_enable; if (!chip->disable) chip->disable = default_disable; if (!chip->startup) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9ba7aece0e4d..ac8c749dfee5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -328,7 +328,6 @@ struct irq_chip no_irq_chip = { .irq_ack = ack_bad, .startup = compat_noop_ret, .shutdown = compat_noop, - .enable = compat_noop, .disable = compat_noop, .end = compat_noop, }; @@ -348,7 +347,6 @@ struct irq_chip dummy_irq_chip = { .irq_unmask = noop, .startup = compat_noop_ret, .shutdown = compat_noop, - .enable = compat_noop, .disable = compat_noop, .end = compat_noop, }; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 47c56a097928..a798a2328f8a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * Make sure the interrupt is enabled, before resending it: */ - desc->irq_data.chip->enable(irq); + desc->irq_data.chip->irq_enable(&desc->irq_data); /* * We do not resend level type interrupts. Level type -- cgit v1.2.2 From bc310dda41be6439364c8f3b9fe7c9d743d22b1c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:02 +0000 Subject: genirq: Provide compat handling for chip->disable()/shutdown() Wrap the old chip functions disable() and shutdown() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.532070631@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/autoprobe.c | 6 +++--- kernel/irq/chip.c | 37 +++++++++++++++++++++++++++---------- kernel/irq/handle.c | 4 ---- kernel/irq/manage.c | 8 ++++---- kernel/irq/spurious.c | 2 +- 5 files changed, 35 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index f9bf9b228033..95806a45be78 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -98,7 +98,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. */ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } else if (i < 32) mask |= 1 << i; @@ -137,7 +137,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -181,7 +181,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a95b47831269..b8a47f0a26cc 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -298,7 +298,7 @@ static void default_enable(struct irq_data *data) /* * default disable function */ -static void default_disable(unsigned int irq) +static void default_disable(struct irq_data *data) { } @@ -316,9 +316,9 @@ static unsigned int default_startup(unsigned int irq) /* * default shutdown function */ -static void default_shutdown(unsigned int irq) +static void default_shutdown(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; @@ -355,6 +355,16 @@ 
static void compat_irq_enable(struct irq_data *data) data->chip->enable(data->irq); } +static void compat_irq_disable(struct irq_data *data) +{ + data->chip->disable(data->irq); +} + +static void compat_irq_shutdown(struct irq_data *data) +{ + data->chip->shutdown(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -376,28 +386,35 @@ void irq_chip_set_defaults(struct irq_chip *chip) */ if (chip->enable) chip->irq_enable = compat_irq_enable; + if (chip->disable) + chip->irq_disable = compat_irq_disable; + if (chip->shutdown) + chip->irq_shutdown = compat_irq_shutdown; /* * The real defaults */ if (!chip->irq_enable) chip->irq_enable = default_enable; - if (!chip->disable) - chip->disable = default_disable; + if (!chip->irq_disable) + chip->irq_disable = default_disable; if (!chip->startup) chip->startup = default_startup; /* - * We use chip->disable, when the user provided its own. When - * we have default_disable set for chip->disable, then we need + * We use chip->irq_disable, when the user provided its own. When + * we have default_disable set for chip->irq_disable, then we need * to use default_shutdown, otherwise the irq line is not * disabled on free_irq(): */ - if (!chip->shutdown) - chip->shutdown = chip->disable != default_disable ? - chip->disable : default_shutdown; + if (!chip->irq_shutdown) + chip->irq_shutdown = chip->irq_disable != default_disable ? 
+ chip->irq_disable : default_shutdown; if (!chip->end) chip->end = dummy_irq_chip.end; + /* + * Now fix up the remaining compat handlers + */ if (chip->bus_lock) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ac8c749dfee5..60e25c46eb55 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -327,8 +327,6 @@ struct irq_chip no_irq_chip = { .irq_disable = noop, .irq_ack = ack_bad, .startup = compat_noop_ret, - .shutdown = compat_noop, - .disable = compat_noop, .end = compat_noop, }; @@ -346,8 +344,6 @@ struct irq_chip dummy_irq_chip = { .irq_mask = noop, .irq_unmask = noop, .startup = compat_noop_ret, - .shutdown = compat_noop, - .disable = compat_noop, .end = compat_noop, }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b3986bce64ff..f3f36f6af9a1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -223,7 +223,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->irq_data.chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } } @@ -919,10 +919,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->shutdown) - desc->irq_data.chip->shutdown(irq); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); else - desc->irq_data.chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } #ifdef CONFIG_SMP diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 36c2c9289e2b..9ee704d3a23c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -254,7 +254,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - 
desc->irq_data.chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); -- cgit v1.2.2 From 37e12df709f09eac17314d79a52190ac46746e33 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:38 +0000 Subject: genirq: Provide compat handling for chip->startup() Wrap the old chip function startup() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.635152961@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 19 +++++++++++++------ kernel/irq/handle.c | 7 ------- kernel/irq/manage.c | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 95806a45be78..7a468254e533 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -59,7 +59,7 @@ unsigned long probe_irq_on(void) */ if (desc->irq_data.chip->set_type) desc->irq_data.chip->set_type(i, IRQ_TYPE_PROBE); - desc->irq_data.chip->startup(i); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -76,7 +76,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->startup(i)) + if (desc->irq_data.chip->irq_startup(&desc->irq_data)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b8a47f0a26cc..cce85f0734b0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -305,11 +305,11 @@ static void default_disable(struct irq_data *data) /* * default startup function */ -static unsigned int default_startup(unsigned int irq) +static unsigned int default_startup(struct irq_data *data) { - struct irq_desc 
*desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->irq_data.chip->irq_enable(&desc->irq_data); + desc->irq_data.chip->irq_enable(data); return 0; } @@ -365,6 +365,11 @@ static void compat_irq_shutdown(struct irq_data *data) data->chip->shutdown(data->irq); } +static unsigned int compat_irq_startup(struct irq_data *data) +{ + return data->chip->startup(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -390,6 +395,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_disable = compat_irq_disable; if (chip->shutdown) chip->irq_shutdown = compat_irq_shutdown; + if (chip->startup) + chip->irq_startup = compat_irq_startup; /* * The real defaults @@ -398,8 +405,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_enable = default_enable; if (!chip->irq_disable) chip->irq_disable = default_disable; - if (!chip->startup) - chip->startup = default_startup; + if (!chip->irq_startup) + chip->irq_startup = default_startup; /* * We use chip->irq_disable, when the user provided its own. 
When * we have default_disable set for chip->irq_disable, then we need @@ -786,7 +793,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->irq_data.chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 60e25c46eb55..8d0697f892a2 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -311,11 +311,6 @@ static unsigned int noop_ret(struct irq_data *data) static void compat_noop(unsigned int irq) { } -static unsigned int compat_noop_ret(unsigned int irq) -{ - return 0; -} - /* * Generic no controller implementation */ @@ -326,7 +321,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - .startup = compat_noop_ret, .end = compat_noop, }; @@ -343,7 +337,6 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - .startup = compat_noop_ret, .end = compat_noop, }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f3f36f6af9a1..31d7678e0269 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -779,7 +779,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } else /* Undo nested disables: */ desc->depth = 1; -- cgit v1.2.2 From c96b3b3c448592a0b87ef20306deb8b1fb4878c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:41 +0000 Subject: genirq: Provide compat handling for chip->set_affinity() Wrap the old chip function set_affinity() until the migration is complete and the old chip functions are removed. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.732894108@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 8 ++++++++ kernel/irq/manage.c | 11 ++++++----- kernel/irq/migration.c | 6 ++++-- kernel/irq/proc.c | 2 +- 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cce85f0734b0..df51792d9fd3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -370,6 +370,12 @@ static unsigned int compat_irq_startup(struct irq_data *data) return data->chip->startup(data->irq); } +static int compat_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + return data->chip->set_affinity(data->irq, dest); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -436,6 +442,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_mask_ack = compat_irq_mask_ack; if (chip->eoi) chip->irq_eoi = compat_irq_eoi; + if (chip->set_affinity) + chip->irq_set_affinity = compat_irq_set_affinity; } static inline void mask_ack_irq(struct irq_desc *desc) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 31d7678e0269..305a60ff756b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -74,7 +74,7 @@ int irq_can_set_affinity(unsigned int irq) struct irq_desc *desc = irq_to_desc(irq); if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->set_affinity) + !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -109,16 +109,17 @@ void irq_set_thread_affinity(struct irq_desc *desc) int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; - if (!desc->irq_data.chip->set_affinity) + if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); #ifdef 
CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { - if (!desc->irq_data.chip->set_affinity(irq, cpumask)) { + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } @@ -128,7 +129,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->pending_mask, cpumask); } #else - if (!desc->irq_data.chip->set_affinity(irq, cpumask)) { + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } @@ -177,7 +178,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->irq_data.chip->set_affinity(irq, desc->irq_data.affinity); + desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 7888e5d5575a..1d2541940480 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,6 +7,7 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -24,7 +25,7 @@ void move_masked_irq(int irq) if (unlikely(cpumask_empty(desc->pending_mask))) return; - if (!desc->irq_data.chip->set_affinity) + if (!chip->irq_set_affinity) return; assert_raw_spin_locked(&desc->lock); @@ -43,7 +44,8 @@ void move_masked_irq(int irq) */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) - if (!desc->irq_data.chip->set_affinity(irq, desc->pending_mask)) { + if (!chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false)) { cpumask_copy(desc->irq_data.affinity, desc->pending_mask); irq_set_thread_affinity(desc); } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9b0da94b5b2b..d9fddf918b41 100644 
--- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; -- cgit v1.2.2 From b2ba2c30033c10cca2454f8b44bf98f5249e61c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:47 +0000 Subject: genirq: Provide compat handling for chip->set_type() Wrap the old chip function set_type() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.832261548@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/autoprobe.c | 5 +++-- kernel/irq/chip.c | 7 +++++++ kernel/irq/manage.c | 10 +++++----- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 7a468254e533..505798f86c36 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -57,8 +57,9 @@ unsigned long probe_irq_on(void) * Some chips need to know about probing in * progress: */ - if (desc->irq_data.chip->set_type) - desc->irq_data.chip->set_type(i, IRQ_TYPE_PROBE); + if (desc->irq_data.chip->irq_set_type) + desc->irq_data.chip->irq_set_type(&desc->irq_data, + IRQ_TYPE_PROBE); desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index df51792d9fd3..b7dd02a99c80 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -376,6 +376,11 @@ static int compat_irq_set_affinity(struct irq_data *data, return data->chip->set_affinity(data->irq, dest); } +static int compat_irq_set_type(struct irq_data *data, unsigned int type) +{ + return data->chip->set_type(data->irq, type); +} + static void compat_bus_lock(struct 
irq_data *data) { data->chip->bus_lock(data->irq); @@ -444,6 +449,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_eoi = compat_irq_eoi; if (chip->set_affinity) chip->irq_set_affinity = compat_irq_set_affinity; + if (chip->set_type) + chip->irq_set_type = compat_irq_set_type; } static inline void mask_ack_irq(struct irq_desc *desc) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 305a60ff756b..3618362b3d8d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -430,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) + unsigned long flags) { int ret; struct irq_chip *chip = desc->irq_data.chip; - if (!chip || !chip->set_type) { + if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? @@ -446,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } /* caller masked out all except trigger mode flags */ - ret = chip->set_type(irq, flags); + ret = chip->irq_set_type(&desc->irq_data, flags); if (ret) - pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)flags, irq, chip->set_type); + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); else { if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) flags |= IRQ_LEVEL; -- cgit v1.2.2 From 2f7e99bb9be6a2d8d7b808dc86037710cc8b7bf1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:50 +0000 Subject: genirq: Provide compat handling for chip->set_wake() Wrap the old chip function set_wake() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121842.927527393@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 7 +++++++ kernel/irq/manage.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7dd02a99c80..8775dd39ab3d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -381,6 +381,11 @@ static int compat_irq_set_type(struct irq_data *data, unsigned int type) return data->chip->set_type(data->irq, type); } +static int compat_irq_set_wake(struct irq_data *data, unsigned int on) +{ + return data->chip->set_wake(data->irq, on); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -451,6 +456,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_set_affinity = compat_irq_set_affinity; if (chip->set_type) chip->irq_set_type = compat_irq_set_type; + if (chip->set_wake) + chip->irq_set_wake = compat_irq_set_wake; } static inline void mask_ack_irq(struct irq_desc *desc) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3618362b3d8d..644e8d5fa367 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -337,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; - if (desc->irq_data.chip->set_wake) - ret = desc->irq_data.chip->set_wake(irq, on); + if (desc->irq_data.chip->irq_set_wake) + ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } -- cgit v1.2.2 From 21e2b8c62cca8f7dbec0c8c131ca1637e4a5670f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:45:53 +0000 Subject: genirq: Provide compat handling for chip->retrigger() Wrap the old chip function retrigger() until the migration is complete and the old chip functions are removed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121843.025801092@linutronix.de> Reviewed-by: H. 
Peter Anvin Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 7 +++++++ kernel/irq/resend.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8775dd39ab3d..f2c4d28c508a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -386,6 +386,11 @@ static int compat_irq_set_wake(struct irq_data *data, unsigned int on) return data->chip->set_wake(data->irq, on); } +static int compat_irq_retrigger(struct irq_data *data) +{ + return data->chip->retrigger(data->irq); +} + static void compat_bus_lock(struct irq_data *data) { data->chip->bus_lock(data->irq); @@ -458,6 +463,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_set_type = compat_irq_set_type; if (chip->set_wake) chip->irq_set_wake = compat_irq_set_wake; + if (chip->retrigger) + chip->irq_retrigger = compat_irq_retrigger; } static inline void mask_ack_irq(struct irq_desc *desc) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index a798a2328f8a..891115a929aa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -70,8 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->irq_data.chip->retrigger || - !desc->irq_data.chip->retrigger(irq)) { + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); -- cgit v1.2.2 From bd151412263a67b5321e9dd1d5b4bf6d96fdebf3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 15:17:14 +0200 Subject: genirq: Provide config option to disable deprecated code This option covers now the old chip functions and the irq_desc data fields which are moving to struct irq_data. More stuff will follow. 
Pretty handy for testing a conversion, whether something broke or not. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/Kconfig | 4 ++++ kernel/irq/chip.c | 8 +++++++- kernel/irq/handle.c | 9 +++++++-- kernel/irq/internals.h | 10 ++++++++++ kernel/irq/spurious.c | 6 ++++-- 5 files changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index e0fc6cd78aa0..a42c0191d71a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -12,6 +12,10 @@ config GENERIC_HARDIRQS config GENERIC_HARDIRQS_NO__DO_IRQ def_bool y +# Select this to disable the deprecated stuff +config GENERIC_HARDIRQS_NO_DEPRECATED + def_bool n + # Options selectable by the architecture code config HAVE_SPARSE_IRQ def_bool n diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f2c4d28c508a..323547983f15 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -324,6 +324,7 @@ static void default_shutdown(struct irq_data *data) desc->status |= IRQ_MASKED; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED /* Temporary migration helpers */ static void compat_irq_mask(struct irq_data *data) { @@ -400,12 +401,14 @@ static void compat_bus_sync_unlock(struct irq_data *data) { data->chip->bus_sync_unlock(data->irq); } +#endif /* * Fixup enable/disable function pointers */ void irq_chip_set_defaults(struct irq_chip *chip) { +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED /* * Compat fixup functions need to be before we set the * defaults for enable/disable/startup/shutdown @@ -418,7 +421,7 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_shutdown = compat_irq_shutdown; if (chip->startup) chip->irq_startup = compat_irq_startup; - +#endif /* * The real defaults */ @@ -437,6 +440,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) if (!chip->irq_shutdown) chip->irq_shutdown = chip->irq_disable != default_disable ? 
chip->irq_disable : default_shutdown; + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; @@ -465,6 +470,7 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_set_wake = compat_irq_set_wake; if (chip->retrigger) chip->irq_retrigger = compat_irq_retrigger; +#endif } static inline void mask_ack_irq(struct irq_desc *desc) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 8d0697f892a2..3fcef37154a1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -309,7 +309,12 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED static void compat_noop(unsigned int irq) { } +#define END_INIT .end = compat_noop +#else +#define END_INIT +#endif /* * Generic no controller implementation @@ -321,7 +326,7 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - .end = compat_noop, + END_INIT }; /* @@ -337,7 +342,7 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - .end = compat_noop, + END_INIT }; /* diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ecafbfee5b12..b905f0ab1bb2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,6 +42,16 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static inline void irq_end(unsigned int irq, struct irq_desc *desc) +{ + if (desc->irq_data.chip && desc->irq_data.chip->end) + desc->irq_data.chip->end(irq); +} +#else +static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } +#endif + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9ee704d3a23c..3089d3b9d5f3 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -14,6 +14,8 
@@ #include #include +#include "internals.h" + static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) @@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work && desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); + if (work) + irq_end(irq, desc); raw_spin_unlock(&desc->lock); return ok; -- cgit v1.2.2 From 773e3f93577ffb493fb7c39b1a6ecf39b5748e87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Oct 2010 14:03:02 -0700 Subject: rcu: move check from rcu_dereference_bh to rcu_read_lock_bh_held As suggested by Linus, push the irqs_disabled() down to the rcu_read_lock_bh_held() level so that all callers get the benefit of the correct check. Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..0af1dc70fece 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -86,7 +86,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - return in_softirq(); + return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); -- cgit v1.2.2 From b0a0f667a349247bd7f05f806b662a25653822bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Oct 2010 17:32:51 -0700 Subject: sched: suppress RCU lockdep splat in task_fork_fair > =================================================== > [ INFO: suspicious rcu_dereference_check() usage. ] > --------------------------------------------------- > /home/greearb/git/linux.wireless-testing/kernel/sched.c:618 invoked rcu_dereference_check() without protection! 
> > other info that might help us debug this: > > rcu_scheduler_active = 1, debug_locks = 1 > 1 lock held by ifup/23517: > #0: (&rq->lock){-.-.-.}, at: [] task_fork_fair+0x3b/0x108 > > stack backtrace: > Pid: 23517, comm: ifup Not tainted 2.6.36-rc6-wl+ #5 > Call Trace: > [] ? printk+0xf/0x16 > [] lockdep_rcu_dereference+0x74/0x7d > [] task_group+0x6d/0x79 > [] set_task_rq+0xe/0x57 > [] task_fork_fair+0x57/0x108 > [] sched_fork+0x82/0xf9 > [] copy_process+0x569/0xe8e > [] do_fork+0x118/0x262 > [] ? do_page_fault+0x16a/0x2cf > [] ? up_read+0x16/0x2a > [] sys_clone+0x1b/0x20 > [] ptregs_clone+0x15/0x30 > [] ? sysenter_do_call+0x12/0x38 Here a newly created task is having its runqueue assigned. The new task is not yet on the tasklist, so cannot go away. This is therefore a false positive; suppress it with an RCU read-side critical section. Reported-by: Ben Greear Tested-by: Ben Greear Date: Tue, 28 Sep 2010 16:32:43 +0800 Subject: rcu: using ACCESS_ONCE() to observe the jiffies_stall/rnp->qsmask value Use ACCESS_ONCE() to observe the jiffies_stall/rnp->qsmask value, because the caller does not hold the root_rcu/rnp node's lock. Although use without ACCESS_ONCE() is safe due to the value loaded being used but once, the ACCESS_ONCE() is a good documentation aid -- the variables are being loaded without the services of a lock. Signed-off-by: Dongdong Deng CC: Dipankar Sarma CC: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e75073504a31..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -545,9 +545,9 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_cpu_stall_suppress) return; - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); -- cgit v1.2.2 From 6506cf6ce68d78a5470a8360c965dafe8e4b78e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Sep 2010 17:50:31 +0200 Subject: sched: fix RCU lockdep splat from task_group() This addresses the following RCU lockdep splat: [0.051203] CPU0: AMD QEMU Virtual CPU version 0.12.4 stepping 03 [0.052999] lockdep: fixing up alternatives. [0.054105] [0.054106] =================================================== [0.054999] [ INFO: suspicious rcu_dereference_check() usage. ] [0.054999] --------------------------------------------------- [0.054999] kernel/sched.c:616 invoked rcu_dereference_check() without protection! 
[0.054999] [0.054999] other info that might help us debug this: [0.054999] [0.054999] [0.054999] rcu_scheduler_active = 1, debug_locks = 1 [0.054999] 3 locks held by swapper/1: [0.054999] #0: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_up+0x42/0x6a [0.054999] #1: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2a/0x51 [0.054999] #2: (&rq->lock){-.-...}, at: [] init_idle+0x2f/0x113 [0.054999] [0.054999] stack backtrace: [0.054999] Pid: 1, comm: swapper Not tainted 2.6.35 #1 [0.054999] Call Trace: [0.054999] [] lockdep_rcu_dereference+0x9b/0xa3 [0.054999] [] task_group+0x7b/0x8a [0.054999] [] set_task_rq+0x13/0x40 [0.054999] [] init_idle+0xd2/0x113 [0.054999] [] fork_idle+0xb8/0xc7 [0.054999] [] ? mark_held_locks+0x4d/0x6b [0.054999] [] do_fork_idle+0x17/0x2b [0.054999] [] native_cpu_up+0x1c1/0x724 [0.054999] [] ? do_fork_idle+0x0/0x2b [0.054999] [] _cpu_up+0xac/0x127 [0.054999] [] cpu_up+0x55/0x6a [0.054999] [] kernel_init+0xe1/0x1ff [0.054999] [] kernel_thread_helper+0x4/0x10 [0.054999] [] ? restore_args+0x0/0x30 [0.054999] [] ? kernel_init+0x0/0x1ff [0.054999] [] ? kernel_thread_helper+0x0/0x10 [0.056074] Booting Node 0, Processors #1lockdep: fixing up alternatives. [0.130045] #2lockdep: fixing up alternatives. [0.203089] #3 Ok. [0.275286] Brought up 4 CPUs [0.276005] Total of 4 processors activated (16017.17 BogoMIPS). The cgroup_subsys_state structures referenced by idle tasks are never freed, because the idle tasks should be part of the root cgroup, which is not removable. The problem is that while we do in-fact hold rq->lock, the newly spawned idle thread's cpu is not yet set to the correct cpu so the lockdep check in task_group(): lockdep_is_held(&task_rq(p)->lock) will fail. But this is a chicken and egg problem. Setting the CPU's runqueue requires that the CPU's runqueue already be set. ;-) So insert an RCU read-side critical section to avoid the complaint. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney --- kernel/sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..ae8f75a5ceb4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5337,7 +5337,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) -- cgit v1.2.2 From e144710b302525de5b90b9c3ba43562458d8957f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 16:03:45 +0200 Subject: genirq: Distangle irq.h Move irq_desc and internal functions out of irq.h Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/internals.h | 100 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b905f0ab1bb2..e281e45fbb55 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,6 +1,7 @@ /* * IRQ subsystem internal functions and variables: */ +#include extern int noirqdebug; @@ -22,6 +23,9 @@ extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern raw_spinlock_t sparse_irq_lock; +/* Resending of interrupts :*/ +void check_irq_resend(struct irq_desc *desc, unsigned int irq); + #ifdef CONFIG_SPARSE_IRQ void replace_irq_desc(unsigned int irq, struct irq_desc *desc); #endif @@ -105,3 +109,99 @@ static inline void 
print_irq_desc(unsigned int irq, struct irq_desc *desc) #undef P +/* Stuff below will be cleaned up after the sparse allocator is done */ + +#ifdef CONFIG_SMP +/** + * alloc_desc_masks - allocate cpumasks for irq_desc + * @desc: pointer to irq_desc struct + * @node: node which will be handling the cpumasks + * @boot: true if need bootmem + * + * Allocates affinity and pending_mask cpumask if required. + * Returns true if successful (or not required). + */ +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, + bool boot) +{ + gfp_t gfp = GFP_ATOMIC; + + if (boot) + gfp = GFP_NOWAIT; + +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + return false; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + free_cpumask_var(desc->irq_data.affinity); + return false; + } +#endif +#endif + return true; +} + +static inline void init_desc_masks(struct irq_desc *desc) +{ + cpumask_setall(desc->irq_data.affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + +/** + * init_copy_desc_masks - copy cpumasks for irq_desc + * @old_desc: pointer to old irq_desc struct + * @new_desc: pointer to new irq_desc struct + * + * Insures affinity and pending_masks are copied to new irq_desc. + * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the + * irq_desc struct so the copy is redundant. 
+ */ + +static inline void init_copy_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK + cpumask_copy(new_desc->irq_data.affinity, old_desc->irq_data.affinity); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); +#endif +#endif +} + +static inline void free_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ + free_cpumask_var(old_desc->irq_data.affinity); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(old_desc->pending_mask); +#endif +} + +#else /* !CONFIG_SMP */ + +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, + bool boot) +{ + return true; +} + +static inline void init_desc_masks(struct irq_desc *desc) +{ +} + +static inline void init_copy_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ +} + +static inline void free_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ +} +#endif /* CONFIG_SMP */ -- cgit v1.2.2 From 442471848f5abb55b99cba1229301655f67492b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 10:40:18 +0200 Subject: genirq: Provide status modifier Provide a irq_desc.status modifier function to cleanup the direct access to irq_desc in arch and driver code. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 323547983f15..2b1f6906b824 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -851,32 +851,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void set_irq_noprobe(unsigned int irq) +void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); + if (!desc) return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_NOPROBE; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} -void set_irq_probe(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; - } + /* Sanitize flags */ + set &= IRQF_MODIFY_MASK; + clr &= IRQF_MODIFY_MASK; raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~IRQ_NOPROBE; + desc->status &= ~clr; + desc->status |= set; raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.2.2 From f303a6dd127b5ec6de90d1cd79ed19820c7e9658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:34:01 +0200 Subject: genirq: Sanitize irq_data accessors Get the data structure from the core and provide inline wrappers to access the irq_data members. Provide accessor inlines for irq_data as well. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2b1f6906b824..659be326c8e8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -256,6 +256,14 @@ int set_irq_chip_data(unsigned int irq, void *data) } EXPORT_SYMBOL(set_irq_chip_data); +struct irq_data *irq_get_irq_data(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? &desc->irq_data : NULL; +} +EXPORT_SYMBOL_GPL(irq_get_irq_data); + /** * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq * -- cgit v1.2.2 From 3795de236d67a05994a1a12759db9d4dd9ffc42c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Sep 2010 17:09:43 +0200 Subject: genirq: Distangle kernel/irq/handle.c kernel/irq/handle.c has become a dumping ground for random code in random order. Split out the irq descriptor management and the dummy irq_chip implementation into separate files. Clean up the include maze while at it. No code change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/Makefile | 2 +- kernel/irq/dummychip.c | 68 ++++++++++ kernel/irq/handle.c | 333 +------------------------------------------------ kernel/irq/irqdesc.c | 269 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 340 insertions(+), 332 deletions(-) create mode 100644 kernel/irq/dummychip.c create mode 100644 kernel/irq/irqdesc.c (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d047808419d..1eaab0da56db 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,5 @@ -obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 000000000000..918dea9de9ea --- /dev/null +++ b/kernel/irq/dummychip.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the dummy interrupt chip implementation + */ +#include +#include + +#include "internals.h" + +/* + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. 
+ */ +static void ack_bad(struct irq_data *data) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + print_irq_desc(data->irq, desc); + ack_bad_irq(data->irq); +} + +/* + * NOP functions + */ +static void noop(struct irq_data *data) { } + +static unsigned int noop_ret(struct irq_data *data) +{ + return 0; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_CRUFT +static void compat_noop(unsigned int irq) { } +#define END_INIT .end = compat_noop +#else +#define END_INIT +#endif + +/* + * Generic no controller implementation + */ +struct irq_chip no_irq_chip = { + .name = "none", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = ack_bad, + END_INIT +}; + +/* + * Generic dummy implementation which can be used for + * real dumb interrupt sources + */ +struct irq_chip dummy_irq_chip = { + .name = "dummy", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + END_INIT +}; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3fcef37154a1..e2347eb63306 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,24 +11,15 @@ */ #include -#include -#include -#include #include +#include #include #include -#include -#include -#include + #include #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -43,308 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) ack_bad_irq(irq); } -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -static void __init init_irq_default_affinity(void) -{ - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); -} -#else -static void __init init_irq_default_affinity(void) -{ -} -#endif - -/* - * Linux has a controller-independent 
interrupt architecture. - * Every controller has a 'controller-template', that is used - * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the appropriate - * controller. Thus drivers need not be aware of the - * interrupt-controller. - * - * The code is designed to be easily extended with new/different - * interrupt controllers, without having to do assembly magic or - * having to touch the generic code. - * - * Controller mappings for all interrupt sources: - */ -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); - -#ifdef CONFIG_SPARSE_IRQ - -static struct irq_desc irq_desc_init = { - .status = IRQ_DISABLED, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -}; - -void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) -{ - void *ptr; - - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - - /* - * don't overwite if can not get new one - * init_copy_kstat_irqs() could still use old one - */ - if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); - desc->kstat_irqs = ptr; - } -} - -static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - - raw_spin_lock_init(&desc->lock); - desc->irq_data.irq = irq; -#ifdef CONFIG_SMP - desc->irq_data.node = node; -#endif - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, node, nr_cpu_ids); - if (!desc->kstat_irqs) { - printk(KERN_ERR "can not alloc kstat_irqs\n"); - BUG_ON(1); - } - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } - init_desc_masks(desc); - arch_init_chip_data(desc, node); -} - -/* - * Protect the sparse_irqs: - */ -DEFINE_RAW_SPINLOCK(sparse_irq_lock); - -static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); - -static void set_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - 
radix_tree_insert(&irq_desc_tree, irq, desc); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return radix_tree_lookup(&irq_desc_tree, irq); -} - -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - void **ptr; - - ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); - if (ptr) - radix_tree_replace_slot(ptr, desc); -} - -static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS_LEGACY-1] = { - .status = IRQ_DISABLED, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), - } -}; - -static unsigned int *kstat_irqs_legacy; - -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int legacy_count; - int node; - int i; - - init_irq_default_affinity(); - - /* initialize nr_irqs based on nr_cpu_ids */ - arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); - - desc = irq_desc_legacy; - legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; - - /* allocate based on nr_cpu_ids */ - kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int), GFP_NOWAIT, node); - - irq_desc_init.irq_data.chip = &no_irq_chip; - - for (i = 0; i < legacy_count; i++) { - desc[i].irq_data.irq = i; - desc[i].irq_data.chip = &no_irq_chip; -#ifdef CONFIG_SMP - desc[i].irq_data.node = node; -#endif - desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - alloc_desc_masks(&desc[i], node, true); - init_desc_masks(&desc[i]); - set_irq_desc(i, &desc[i]); - } - - return arch_early_irq_init(); -} - -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= nr_irqs) { - WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", - irq, nr_irqs); - return NULL; - } - - desc = irq_to_desc(irq); - if (desc) - return desc; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* 
We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - if (desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - - printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); - if (!desc) { - printk(KERN_ERR "can not alloc irq_desc\n"); - BUG_ON(1); - } - init_one_irq_desc(irq, desc, node); - - set_irq_desc(irq, desc); - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -#else /* !CONFIG_SPARSE_IRQ */ - -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS-1] = { - .status = IRQ_DISABLED, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), - } -}; - -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int count; - int i; - - init_irq_default_affinity(); - - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); - - desc = irq_desc; - count = ARRAY_SIZE(irq_desc); - - for (i = 0; i < count; i++) { - desc[i].irq_data.irq = i; - desc[i].irq_data.chip = &no_irq_chip; - alloc_desc_masks(&desc[i], 0, true); - init_desc_masks(&desc[i]); - desc[i].kstat_irqs = kstat_irqs_all[i]; - } - return arch_early_irq_init(); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? irq_desc + irq : NULL; -} - -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} -#endif /* !CONFIG_SPARSE_IRQ */ - -void clear_kstat_irqs(struct irq_desc *desc) -{ - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); -} - -/* - * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. 
- */ -static void ack_bad(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - print_irq_desc(data->irq, desc); - ack_bad_irq(data->irq); -} - -/* - * NOP functions - */ -static void noop(struct irq_data *data) { } - -static unsigned int noop_ret(struct irq_data *data) -{ - return 0; -} - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - -/* - * Generic no controller implementation - */ -struct irq_chip no_irq_chip = { - .name = "none", - .irq_startup = noop_ret, - .irq_shutdown = noop, - .irq_enable = noop, - .irq_disable = noop, - .irq_ack = ack_bad, - END_INIT -}; - -/* - * Generic dummy implementation which can be used for - * real dumb interrupt sources - */ -struct irq_chip dummy_irq_chip = { - .name = "dummy", - .irq_startup = noop_ret, - .irq_shutdown = noop, - .irq_enable = noop, - .irq_disable = noop, - .irq_ack = noop, - .irq_mask = noop, - .irq_unmask = noop, - END_INIT -}; - /* * Special, empty irq handler: */ @@ -540,21 +229,3 @@ out: return 1; } #endif - -void early_init_irq_lock_class(void) -{ - struct irq_desc *desc; - int i; - - for_each_irq_desc(i, desc) { - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - } -} - -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc ? 
desc->kstat_irqs[cpu] : 0; -} -EXPORT_SYMBOL(kstat_irqs_cpu); - diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 000000000000..fbf8cfa00510 --- /dev/null +++ b/kernel/irq/irqdesc.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the interrupt descriptor management code + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +struct lock_class_key irq_desc_lock_class; + +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +static void __init init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpumask_setall(irq_default_affinity); +} +#else +static void __init init_irq_default_affinity(void) +{ +} +#endif + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + +#ifdef CONFIG_SPARSE_IRQ + +static struct irq_desc irq_desc_init = { + .status = IRQ_DISABLED, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +}; + +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) +{ + void *ptr; + + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + + /* + * don't overwite if can not get new one + * init_copy_kstat_irqs() could still use old one + */ + if (ptr) { + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); + desc->kstat_irqs = ptr; + } +} + +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) +{ + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); + + raw_spin_lock_init(&desc->lock); + desc->irq_data.irq = irq; +#ifdef CONFIG_SMP + desc->irq_data.node = node; +#endif + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_kstat_irqs(desc, node, 
nr_cpu_ids); + if (!desc->kstat_irqs) { + printk(KERN_ERR "can not alloc kstat_irqs\n"); + BUG_ON(1); + } + if (!alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } + init_desc_masks(desc); + arch_init_chip_data(desc, node); +} + +/* + * Protect the sparse_irqs: + */ +DEFINE_RAW_SPINLOCK(sparse_irq_lock); + +static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); + +static void set_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +void replace_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + void **ptr; + + ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); + if (ptr) + radix_tree_replace_slot(ptr, desc); +} + +static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS_LEGACY-1] = { + .status = IRQ_DISABLED, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + } +}; + +static unsigned int *kstat_irqs_legacy; + +int __init early_irq_init(void) +{ + struct irq_desc *desc; + int legacy_count; + int node; + int i; + + init_irq_default_affinity(); + + /* initialize nr_irqs based on nr_cpu_ids */ + arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); + + desc = irq_desc_legacy; + legacy_count = ARRAY_SIZE(irq_desc_legacy); + node = first_online_node; + + /* allocate based on nr_cpu_ids */ + kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int), GFP_NOWAIT, node); + + irq_desc_init.irq_data.chip = &no_irq_chip; + + for (i = 0; i < legacy_count; i++) { + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; +#ifdef CONFIG_SMP + desc[i].irq_data.node = node; +#endif + desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + 
alloc_desc_masks(&desc[i], node, true); + init_desc_masks(&desc[i]); + set_irq_desc(i, &desc[i]); + } + + return arch_early_irq_init(); +} + +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= nr_irqs) { + WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", + irq, nr_irqs); + return NULL; + } + + desc = irq_to_desc(irq); + if (desc) + return desc; + + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + + /* We have to check it to avoid races with another CPU */ + desc = irq_to_desc(irq); + if (desc) + goto out_unlock; + + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); + if (!desc) { + printk(KERN_ERR "can not alloc irq_desc\n"); + BUG_ON(1); + } + init_one_irq_desc(irq, desc, node); + + set_irq_desc(irq, desc); + +out_unlock: + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + + return desc; +} + +#else /* !CONFIG_SPARSE_IRQ */ + +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS-1] = { + .status = IRQ_DISABLED, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + } +}; + +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; +int __init early_irq_init(void) +{ + struct irq_desc *desc; + int count; + int i; + + init_irq_default_affinity(); + + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) { + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); + desc[i].kstat_irqs = kstat_irqs_all[i]; + } + return arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; +} + +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) +{ + return irq_to_desc(irq); +} +#endif /* !CONFIG_SPARSE_IRQ */ + +void clear_kstat_irqs(struct irq_desc *desc) +{ + memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); +} + +void early_init_irq_lock_class(void) +{ + struct irq_desc *desc; + int i; + + for_each_irq_desc(i, desc) { + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + } +} + +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc ? desc->kstat_irqs[cpu] : 0; +} +EXPORT_SYMBOL(kstat_irqs_cpu); -- cgit v1.2.2 From 154cd387cdf0e5566ce523cbddf92dd2a062dfd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Sep 2010 15:58:45 +0200 Subject: genirq: Remove early_init_irq_lock_class() early_init_irq_lock_class() is called way before anything touches the irq descriptors. In case of SPARSE_IRQ=y this is a NOP operation because the radix tree is empty at this point. For the SPARSE_IRQ=n case it's sufficient to set the lock class in early_irq_init(). 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index fbf8cfa00510..0a7a0908afbc 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -231,6 +231,7 @@ int __init early_irq_init(void) alloc_desc_masks(&desc[i], 0, true); init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); } return arch_early_irq_init(); } @@ -251,16 +252,6 @@ void clear_kstat_irqs(struct irq_desc *desc) memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); } -void early_init_irq_lock_class(void) -{ - struct irq_desc *desc; - int i; - - for_each_irq_desc(i, desc) { - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - } -} - unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); -- cgit v1.2.2 From d895f51ebb54cefe367bda135fcf2cd734d51d03 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 17:45:49 +0200 Subject: genirq: Remove export of kstat_irqs_cpu The statistics accessor is only used by proc/stats and show_interrupts(). Both are compiled in. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0a7a0908afbc..78ff426a6cb7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -257,4 +257,3 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) struct irq_desc *desc = irq_to_desc(irq); return desc ? 
desc->kstat_irqs[cpu] : 0; } -EXPORT_SYMBOL(kstat_irqs_cpu); -- cgit v1.2.2 From 1318a481fc37c503a901b96ae06b692ca2b21af5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 21:01:37 +0200 Subject: genirq: Provide default irq init flags Arch code sets its own irq_desc.status flags right after boot and for dynamically allocated interrupts. That might involve iterating over a huge array. Allow ARCH_IRQ_INIT_FLAGS to set separate flags aside of IRQ_DISABLED which is the default. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 2 +- kernel/irq/irqdesc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 659be326c8e8..3405761d6224 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -31,7 +31,7 @@ static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) /* Ensure we don't have left over values from a previous use of this irq */ raw_spin_lock_irqsave(&desc->lock, flags); - desc->status = IRQ_DISABLED; + desc->status = IRQ_DEFAULT_INIT_FLAGS; desc->irq_data.chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; desc->depth = 1; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 78ff426a6cb7..29963f99f24d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ static struct irq_desc irq_desc_init = { - .status = IRQ_DISABLED, + .status = IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), @@ -113,7 +113,7 @@ void replace_irq_desc(unsigned int irq, struct irq_desc *desc) static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS_LEGACY-1] = { - .status = IRQ_DISABLED, + .status = IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), @@ -204,7 +204,7 @@ out_unlock: struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = IRQ_DISABLED, + .status = IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), -- cgit v1.2.2 From 1f5a5b87f78fade3ae48dfd55e8765d1d622ea4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 17:48:26 +0200 Subject: genirq: Implement a sane sparse_irq allocator The current sparse_irq allocator has several shortcomings due to failures in the design or the lack of it: - Requires iteration over the number of active irqs to find a free slot (Some architectures have grown their own workarounds for this) - Removal of entries is not possible - Racy between create_irq_nr and destroy_irq (plugged by horrible callbacks) - Migration of active irq descriptors is not possible - No bulk allocation of irq ranges - Sprinkled irq_desc references all over the place outside of kernel/irq/ (The previous chip functions series is addressing this issue) Implement a sane allocator which fixes the above shortcomings (though migration of active descriptors needs a full tree wide cleanup of the direct and mostly unlocked access to irq_desc). The new allocator still uses a radix_tree, but uses a bitmap for keeping track of allocated irq numbers. That allows: - Fast lookup of a free slot - Allows the removal of descriptors - Prevents the create/destroy race - Bulk allocation of consecutive irq ranges - Basic design is ready for migration of live descriptors after further cleanups The bitmap is also used in the SPARSE_IRQ=n case for lookup and raceless (de)allocation of irq numbers. So it removes the requirement for looping through the descriptor array to find slots. 
Right now it uses sparse_irq_lock to protect the bitmap and the radix tree, but after cleaning up all users we should be able convert that to a mutex and to switch the radix_tree and decriptor allocations to GFP_KERNEL. [ Folded in a bugfix from Yinghai Lu ] Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 223 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 29963f99f24d..4eea48b4f576 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internals.h" @@ -33,9 +34,54 @@ static void __init init_irq_default_affinity(void) } #endif +#ifdef CONFIG_SMP +static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +{ + if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + return -ENOMEM; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + free_cpumask_var(desc->irq_data.affinity); + return -ENOMEM; + } +#endif + return 0; +} + +static void desc_smp_init(struct irq_desc *desc, int node) +{ + desc->node = node; + cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +} + +#else +static inline int +alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +static inline void desc_smp_init(struct irq_desc *desc, int node) { } +#endif + +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +{ + desc->irq_data.irq = irq; + desc->irq_data.chip = &no_irq_chip; + desc->irq_data.chip_data = NULL; + desc->irq_data.handler_data = NULL; + desc->irq_data.msi_desc = NULL; + desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->name = NULL; + memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + desc_smp_init(desc, node); +} + int nr_irqs = NR_IRQS; 
EXPORT_SYMBOL_GPL(nr_irqs); +DEFINE_RAW_SPINLOCK(sparse_irq_lock); +static DECLARE_BITMAP(allocated_irqs, NR_IRQS); + #ifdef CONFIG_SPARSE_IRQ static struct irq_desc irq_desc_init = { @@ -85,14 +131,9 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) arch_init_chip_data(desc, node); } -/* - * Protect the sparse_irqs: - */ -DEFINE_RAW_SPINLOCK(sparse_irq_lock); - static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); -static void set_irq_desc(unsigned int irq, struct irq_desc *desc) +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { radix_tree_insert(&irq_desc_tree, irq, desc); } @@ -111,6 +152,94 @@ void replace_irq_desc(unsigned int irq, struct irq_desc *desc) radix_tree_replace_slot(ptr, desc); } +static void delete_irq_desc(unsigned int irq) +{ + radix_tree_delete(&irq_desc_tree, irq); +} + +#ifdef CONFIG_SMP +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->affinity); +} +#else +static inline void free_masks(struct irq_desc *desc) { } +#endif + +static struct irq_desc *alloc_desc(int irq, int node) +{ + struct irq_desc *desc; + gfp_t gfp = GFP_KERNEL; + + desc = kzalloc_node(sizeof(*desc), gfp, node); + if (!desc) + return NULL; + /* allocate based on nr_cpu_ids */ + desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), + gfp, node); + if (!desc->kstat_irqs) + goto err_desc; + + if (alloc_masks(desc, gfp, node)) + goto err_kstat; + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + + desc_set_defaults(irq, desc, node); + + return desc; + +err_kstat: + kfree(desc->kstat_irqs); +err_desc: + kfree(desc); + return NULL; +} + +static void free_desc(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + delete_irq_desc(irq); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + 
+ free_masks(desc); + kfree(desc->kstat_irqs); + kfree(desc); +} + +static int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + struct irq_desc *desc; + unsigned long flags; + int i; + + for (i = 0; i < cnt; i++) { + desc = alloc_desc(start + i, node); + if (!desc) + goto err; + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + irq_insert_desc(start + i, desc); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + } + return start; + +err: + for (i--; i >= 0; i--) + free_desc(start + i); + + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + bitmap_clear(allocated_irqs, start, cnt); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + return -ENOMEM; +} + static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { .status = IRQ_DEFAULT_INIT_FLAGS, @@ -155,7 +284,7 @@ int __init early_irq_init(void) lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); alloc_desc_masks(&desc[i], node, true); init_desc_masks(&desc[i]); - set_irq_desc(i, &desc[i]); + irq_insert_desc(i, &desc[i]); } return arch_early_irq_init(); @@ -192,7 +321,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) } init_one_irq_desc(irq, desc, node); - set_irq_desc(irq, desc); + irq_insert_desc(irq, desc); out_unlock: raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); @@ -245,8 +374,94 @@ struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } + +#ifdef CONFIG_SMP +static inline int desc_node(struct irq_desc *desc) +{ + return desc->irq_data.node; +} +#else +static inline int desc_node(struct irq_desc *desc) { return 0; } +#endif + +static void free_desc(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); +} + +static inline int alloc_descs(unsigned int start, unsigned int cnt, 
int node) +{ + return start; +} #endif /* !CONFIG_SPARSE_IRQ */ +/* Dynamic interrupt handling */ + +/** + * irq_free_descs - free irq descriptors + * @from: Start of descriptor range + * @cnt: Number of consecutive irqs to free + */ +void irq_free_descs(unsigned int from, unsigned int cnt) +{ + unsigned long flags; + int i; + + if (from >= nr_irqs || (from + cnt) > nr_irqs) + return; + + for (i = 0; i < cnt; i++) + free_desc(from + i); + + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + bitmap_clear(allocated_irqs, from, cnt); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); +} + +/** + * irq_alloc_descs - allocate and initialize a range of irq descriptors + * @irq: Allocate for specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate. + * @node: Preferred node on which the irq descriptor should be allocated + * + * Returns the first irq number or error code + */ +int __ref +irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +{ + unsigned long flags; + int start, ret; + + if (!cnt) + return -EINVAL; + + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + ret = -EEXIST; + if (irq >=0 && start != irq) + goto err; + + ret = -ENOMEM; + if (start >= nr_irqs) + goto err; + + bitmap_set(allocated_irqs, start, cnt); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + return alloc_descs(start, cnt, node); + +err: + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + return ret; +} + +/* Statistics access */ void clear_kstat_irqs(struct irq_desc *desc) { memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); -- cgit v1.2.2 From 13bfe99e09123ef5edb6acb81ba337d2db600b53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 02:46:07 +0200 Subject: genirq: Prepare proc for real sparse irq support /proc/irq never removes any entries, but when irq descriptors can 
be freed for real this is necessary. Otherwise we'd reference a freed descriptor in /proc/irq/N Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 2 ++ kernel/irq/proc.c | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e281e45fbb55..8eb01e379ccc 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -32,10 +32,12 @@ void replace_irq_desc(unsigned int irq, struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); +extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } +static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4eea48b4f576..6312a2c83971 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -205,6 +205,8 @@ static void free_desc(unsigned int irq) struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + unregister_irq_proc(irq, desc); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); delete_irq_desc(irq); raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d9fddf918b41..01b1d3a88983 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_spurious_proc_fops, (void *)(long)irq); } +void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) +{ + char name 
[MAX_NAMELEN]; + + if (!root_irq_dir || !desc->dir) + return; +#ifdef CONFIG_SMP + remove_proc_entry("smp_affinity", desc->dir); + remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("node", desc->dir); +#endif + remove_proc_entry("spurious", desc->dir); + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%u", irq); + remove_proc_entry(name, root_irq_dir); +} + #undef MAX_NAMELEN void unregister_handler_proc(unsigned int irq, struct irqaction *action) -- cgit v1.2.2 From a98d24b71b6e229965f18dc00d28dc71cb8fe324 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 10:45:07 +0200 Subject: genirq: Implement sane enumeration Use the allocator bitmap to lookup active interrupts. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6312a2c83971..2e7e94ef64da 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -463,6 +463,17 @@ err: return ret; } +/** + * irq_get_next_irq - get next allocated irq number + * @offset: where to start the search + * + * Returns next irq number after offset or nr_irqs if none is found. + */ +unsigned int irq_get_next_irq(unsigned int offset) +{ + return find_next_bit(allocated_irqs, nr_irqs, offset); +} + /* Statistics access */ void clear_kstat_irqs(struct irq_desc *desc) { -- cgit v1.2.2 From 06f6c3399e9f9ff6eafc200e80f9226c3cee0eaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Oct 2010 12:31:46 +0200 Subject: genirq: Implement irq reservation Mark a range of interrupts as allocated. In the SPARSE_IRQ=n case we need this to update the bitmap for the legacy irqs so the enumerator via irq_get_next_irq() works. 
Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2e7e94ef64da..35d9052901b9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -463,6 +463,32 @@ err: return ret; } +/** + * irq_reserve_irqs - mark irqs allocated + * @from: mark from irq number + * @cnt: number of irqs to mark + * + * Returns 0 on success or an appropriate error code + */ +int irq_reserve_irqs(unsigned int from, unsigned int cnt) +{ + unsigned long flags; + unsigned int start; + int ret = 0; + + if (!cnt || (from + cnt) > nr_irqs) + return -EINVAL; + + raw_spin_lock_irqsave(&sparse_irq_lock, flags); + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + if (start == from) + bitmap_set(allocated_irqs, start, cnt); + else + ret = -EEXIST; + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + return ret; +} + /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search -- cgit v1.2.2 From aa99ec0f3f26bf2bcd0fa5176de93598427f1e5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 20:02:56 +0200 Subject: genirq: Use sane sparse allocator Make irq_to_desc_alloc_node() a wrapper around the new allocator. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 129 ++++++++------------------------------------------- 1 file changed, 20 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 35d9052901b9..7cbe4f93e2fb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -51,7 +51,7 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - desc->node = node; + desc->irq_data.node = node; cpumask_copy(desc->irq_data.affinity, irq_default_affinity); } @@ -84,13 +84,6 @@ static DECLARE_BITMAP(allocated_irqs, NR_IRQS); #ifdef CONFIG_SPARSE_IRQ -static struct irq_desc irq_desc_init = { - .status = IRQ_DEFAULT_INIT_FLAGS, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -}; - void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { void *ptr; @@ -108,29 +101,6 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - - raw_spin_lock_init(&desc->lock); - desc->irq_data.irq = irq; -#ifdef CONFIG_SMP - desc->irq_data.node = node; -#endif - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, node, nr_cpu_ids); - if (!desc->kstat_irqs) { - printk(KERN_ERR "can not alloc kstat_irqs\n"); - BUG_ON(1); - } - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } - init_desc_masks(desc); - arch_init_chip_data(desc, node); -} - static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -171,8 +141,9 @@ static inline void free_masks(struct irq_desc *desc) { } static struct irq_desc *alloc_desc(int irq, int node) { + /* Temporary hack until we can switch to 
GFP_KERNEL */ + gfp_t gfp = gfp_allowed_mask == GFP_BOOT_MASK ? GFP_NOWAIT : GFP_ATOMIC; struct irq_desc *desc; - gfp_t gfp = GFP_KERNEL; desc = kzalloc_node(sizeof(*desc), gfp, node); if (!desc) @@ -226,6 +197,8 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node) desc = alloc_desc(start + i, node); if (!desc) goto err; + /* temporary until I fixed x86 madness */ + arch_init_chip_data(desc, node); raw_spin_lock_irqsave(&sparse_irq_lock, flags); irq_insert_desc(start + i, desc); raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); @@ -242,23 +215,19 @@ err: return -ENOMEM; } -static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS_LEGACY-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), - } -}; +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) +{ + int res = irq_alloc_descs(irq, irq, 1, node); -static unsigned int *kstat_irqs_legacy; + if (res == -EEXIST || res == irq) + return irq_to_desc(irq); + return NULL; +} int __init early_irq_init(void) { + int i, node = first_online_node; struct irq_desc *desc; - int legacy_count; - int node; - int i; init_irq_default_affinity(); @@ -266,71 +235,14 @@ int __init early_irq_init(void) arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); - desc = irq_desc_legacy; - legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; - - /* allocate based on nr_cpu_ids */ - kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int), GFP_NOWAIT, node); - - irq_desc_init.irq_data.chip = &no_irq_chip; - - for (i = 0; i < legacy_count; i++) { - desc[i].irq_data.irq = i; - desc[i].irq_data.chip = &no_irq_chip; -#ifdef CONFIG_SMP - desc[i].irq_data.node = node; -#endif - desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - 
alloc_desc_masks(&desc[i], node, true); - init_desc_masks(&desc[i]); - irq_insert_desc(i, &desc[i]); + for (i = 0; i < NR_IRQS_LEGACY; i++) { + desc = alloc_desc(i, node); + set_bit(i, allocated_irqs); + irq_insert_desc(i, desc); } - return arch_early_irq_init(); } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= nr_irqs) { - WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", - irq, nr_irqs); - return NULL; - } - - desc = irq_to_desc(irq); - if (desc) - return desc; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - if (desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - - printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); - if (!desc) { - printk(KERN_ERR "can not alloc irq_desc\n"); - BUG_ON(1); - } - init_one_irq_desc(irq, desc, node); - - irq_insert_desc(irq, desc); - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - #else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { @@ -345,9 +257,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { + int count, i, node = first_online_node; struct irq_desc *desc; - int count; - int i; init_irq_default_affinity(); @@ -359,9 +270,9 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - alloc_desc_masks(&desc[i], 0, true); - init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; + alloc_masks(desc + i, GFP_KERNEL, node); + desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); } return arch_early_irq_init(); -- cgit v1.2.2 From b683de2b3cb17bb10fa6fd4af614dc75b5749fe0 Mon Sep 17 00:00:00 
2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 20:55:03 +0200 Subject: genirq: Query arch for number of early descriptors sparse irq sets up NR_IRQS_LEGACY irq descriptors and archs then go ahead and allocate more. Use the unused return value of arch_probe_nr_irqs() to let the architecture return the number of early allocations. Fix up all users. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 10 +++++----- kernel/softirq.c | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7cbe4f93e2fb..a1fbd1d347af 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -226,16 +226,16 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) int __init early_irq_init(void) { - int i, node = first_online_node; + int i, initcnt, node = first_online_node; struct irq_desc *desc; init_irq_default_affinity(); - /* initialize nr_irqs based on nr_cpu_ids */ - arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); + /* Let arch update nr_irqs and return the nr of preallocated irqs */ + initcnt = arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); - for (i = 0; i < NR_IRQS_LEGACY; i++) { + for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..14a7b80b2cce 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -886,9 +886,10 @@ int __init __weak early_irq_init(void) return 0; } +#ifdef CONFIG_GENERIC_HARDIRQS int __init __weak arch_probe_nr_irqs(void) { - return 0; + return NR_IRQS_LEGACY; } int __init __weak arch_early_irq_init(void) @@ -900,3 +901,4 @@ int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } +#endif -- cgit v1.2.2 From b7d0d8258a9f71949b810e0f82a3d75088f4d364 Mon Sep 17 00:00:00 2001 From: 
Thomas Gleixner Date: Wed, 29 Sep 2010 18:44:23 +0200 Subject: genirq: Remove arch_init_chip_data() This function should not have been there in the first place. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 2 -- kernel/softirq.c | 5 ----- 2 files changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a1fbd1d347af..6c71f8ea5d7d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -197,8 +197,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node) desc = alloc_desc(start + i, node); if (!desc) goto err; - /* temporary until I fixed x86 madness */ - arch_init_chip_data(desc, node); raw_spin_lock_irqsave(&sparse_irq_lock, flags); irq_insert_desc(start + i, desc); raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); diff --git a/kernel/softirq.c b/kernel/softirq.c index 14a7b80b2cce..d19b1c9aa7c5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -896,9 +896,4 @@ int __init __weak arch_early_irq_init(void) { return 0; } - -int __weak arch_init_chip_data(struct irq_desc *desc, int node) -{ - return 0; -} #endif -- cgit v1.2.2 From b7b29338dc7111ed8bd4d6555d84afae13ebe752 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Sep 2010 18:46:55 +0200 Subject: genirq: Sanitize dynamic irq handling Use the cleanup functions of the dynamic allocator. No need to have separate implementations. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/chip.c | 102 ------------------------------------------------- kernel/irq/internals.h | 1 - kernel/irq/irqdesc.c | 41 +++++++++++--------- 3 files changed, 23 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3405761d6224..baa5c4acad83 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,108 +18,6 @@ #include "internals.h" -static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc; - unsigned long flags; - - desc = irq_to_desc(irq); - if (!desc) { - WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - return; - } - - /* Ensure we don't have left over values from a previous use of this irq */ - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status = IRQ_DEFAULT_INIT_FLAGS; - desc->irq_data.chip = &no_irq_chip; - desc->handle_irq = handle_bad_irq; - desc->depth = 1; - desc->irq_data.msi_desc = NULL; - desc->irq_data.handler_data = NULL; - if (!keep_chip_data) - desc->irq_data.chip_data = NULL; - desc->action = NULL; - desc->irq_count = 0; - desc->irqs_unhandled = 0; -#ifdef CONFIG_SMP - cpumask_setall(desc->irq_data.affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -#endif - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) -{ - dynamic_irq_init_x(irq, false); -} - -/** - * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->irq_data.chip_data to NULL - */ -void dynamic_irq_init_keep_chip_data(unsigned int irq) -{ - dynamic_irq_init_x(irq, true); -} - -static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; 
- - if (!desc) { - WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->action) { - raw_spin_unlock_irqrestore(&desc->lock, flags); - WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", - irq); - return; - } - desc->irq_data.msi_desc = NULL; - desc->irq_data.handler_data = NULL; - if (!keep_chip_data) - desc->irq_data.chip_data = NULL; - desc->handle_irq = handle_bad_irq; - desc->irq_data.chip = &no_irq_chip; - desc->name = NULL; - clear_kstat_irqs(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, false); -} - -/** - * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->irq_data.chip_data to NULL - */ -void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, true); -} - - /** * set_irq_chip - set the irq chip for an irq * @irq: irq number diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8eb01e379ccc..f444203a772d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,7 +20,6 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -extern void clear_kstat_irqs(struct irq_desc *desc); extern raw_spinlock_t sparse_irq_lock; /* Resending of interrupts :*/ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6c71f8ea5d7d..c9d5a1c12874 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -53,12 +53,21 @@ static void desc_smp_init(struct irq_desc *desc, int node) { desc->irq_data.node = node; cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +#ifdef 
CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + +static inline int desc_node(struct irq_desc *desc) +{ + return desc->irq_data.node; } #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline int desc_node(struct irq_desc *desc) { return 0; } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) @@ -71,6 +80,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->status = IRQ_DEFAULT_INIT_FLAGS; desc->handle_irq = handle_bad_irq; desc->depth = 1; + desc->irq_count = 0; + desc->irqs_unhandled = 0; desc->name = NULL; memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); desc_smp_init(desc, node); @@ -286,23 +297,9 @@ struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) return irq_to_desc(irq); } -#ifdef CONFIG_SMP -static inline int desc_node(struct irq_desc *desc) -{ - return desc->irq_data.node; -} -#else -static inline int desc_node(struct irq_desc *desc) { return 0; } -#endif - static void free_desc(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc)); - raw_spin_unlock_irqrestore(&desc->lock, flags); + dynamic_irq_cleanup(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) @@ -409,10 +406,18 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } -/* Statistics access */ -void clear_kstat_irqs(struct irq_desc *desc) +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) { - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + struct irq_desc *desc = irq_to_desc(irq); + unsigned long 
flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); } unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -- cgit v1.2.2 From 78f90d91f395cd0dc1ef3f21e0c5cd6fd50d202c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Sep 2010 17:18:47 +0200 Subject: genirq: Remove the now unused sparse irq leftovers The move_irq_desc() function was only used due to the problem that the allocator did not free the old descriptors. So the descriptors had to be moved in create_irq_nr(). That's history. The code would have never been able to move active interrupt descriptors on affinity settings. That can be done in a completely different way w/o all this horror. Remove all of it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/Kconfig | 5 -- kernel/irq/Makefile | 1 - kernel/irq/internals.h | 102 --------------------------------------- kernel/irq/irqdesc.c | 30 +----------- kernel/irq/numa_migrate.c | 120 ---------------------------------------------- 5 files changed, 2 insertions(+), 256 deletions(-) delete mode 100644 kernel/irq/numa_migrate.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a42c0191d71a..31d766bf5d2e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -26,11 +26,6 @@ config GENERIC_IRQ_PROBE config GENERIC_PENDING_IRQ def_bool n -if SPARSE_IRQ && NUMA -config NUMA_IRQ_DESC - def_bool n -endif - config AUTO_IRQ_AFFINITY def_bool n diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1eaab0da56db..54329cd7b3ee 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,4 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devr obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git 
a/kernel/irq/internals.h b/kernel/irq/internals.h index f444203a772d..4571ae7e085a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,17 +18,11 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -extern raw_spinlock_t sparse_irq_lock; /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); -#ifdef CONFIG_SPARSE_IRQ -void replace_irq_desc(unsigned int irq, struct irq_desc *desc); -#endif - #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -110,99 +104,3 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) #undef P -/* Stuff below will be cleaned up after the sparse allocator is done */ - -#ifdef CONFIG_SMP -/** - * alloc_desc_masks - allocate cpumasks for irq_desc - * @desc: pointer to irq_desc struct - * @node: node which will be handling the cpumasks - * @boot: true if need bootmem - * - * Allocates affinity and pending_mask cpumask if required. - * Returns true if successful (or not required). 
- */ -static inline bool alloc_desc_masks(struct irq_desc *desc, int node, - bool boot) -{ - gfp_t gfp = GFP_ATOMIC; - - if (boot) - gfp = GFP_NOWAIT; - -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!alloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) - return false; - -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); - return false; - } -#endif -#endif - return true; -} - -static inline void init_desc_masks(struct irq_desc *desc) -{ - cpumask_setall(desc->irq_data.affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -} - -/** - * init_copy_desc_masks - copy cpumasks for irq_desc - * @old_desc: pointer to old irq_desc struct - * @new_desc: pointer to new irq_desc struct - * - * Insures affinity and pending_masks are copied to new irq_desc. - * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the - * irq_desc struct so the copy is redundant. - */ - -static inline void init_copy_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ -#ifdef CONFIG_CPUMASK_OFFSTACK - cpumask_copy(new_desc->irq_data.affinity, old_desc->irq_data.affinity); - -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); -#endif -#endif -} - -static inline void free_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ - free_cpumask_var(old_desc->irq_data.affinity); - -#ifdef CONFIG_GENERIC_PENDING_IRQ - free_cpumask_var(old_desc->pending_mask); -#endif -} - -#else /* !CONFIG_SMP */ - -static inline bool alloc_desc_masks(struct irq_desc *desc, int node, - bool boot) -{ - return true; -} - -static inline void init_desc_masks(struct irq_desc *desc) -{ -} - -static inline void init_copy_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ -} - -static inline void free_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ -} -#endif /* CONFIG_SMP */ 
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c9d5a1c12874..4f0b9c9d5c46 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -20,7 +20,7 @@ /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ -struct lock_class_key irq_desc_lock_class; +static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) static void __init init_irq_default_affinity(void) @@ -90,28 +90,11 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -DEFINE_RAW_SPINLOCK(sparse_irq_lock); +static DEFINE_RAW_SPINLOCK(sparse_irq_lock); static DECLARE_BITMAP(allocated_irqs, NR_IRQS); #ifdef CONFIG_SPARSE_IRQ -void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) -{ - void *ptr; - - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - - /* - * don't overwite if can not get new one - * init_copy_kstat_irqs() could still use old one - */ - if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); - desc->kstat_irqs = ptr; - } -} - static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -124,15 +107,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return radix_tree_lookup(&irq_desc_tree, irq); } -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - void **ptr; - - ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); - if (ptr) - radix_tree_replace_slot(ptr, desc); -} - static void delete_irq_desc(unsigned int irq) { radix_tree_delete(&irq_desc_tree, irq); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index e7f1f16402c1..000000000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * NUMA irq-desc migration code - * - * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to - * the new "home node" of the IRQ. 
- */ - -#include -#include -#include -#include -#include -#include - -#include "internals.h" - -static void init_copy_kstat_irqs(struct irq_desc *old_desc, - struct irq_desc *desc, - int node, int nr) -{ - init_kstat_irqs(desc, node, nr); - - if (desc->kstat_irqs != old_desc->kstat_irqs) - memcpy(desc->kstat_irqs, old_desc->kstat_irqs, - nr * sizeof(*desc->kstat_irqs)); -} - -static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) -{ - if (old_desc->kstat_irqs == desc->kstat_irqs) - return; - - kfree(old_desc->kstat_irqs); - old_desc->kstat_irqs = NULL; -} - -static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int node) -{ - memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); - return false; - } - raw_spin_lock_init(&desc->lock); - desc->irq_data.node = node; - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); - init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, node); - return true; -} - -static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) -{ - free_kstat_irqs(old_desc, desc); - free_desc_masks(old_desc, desc); - arch_free_chip_data(old_desc, desc); -} - -static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int node) -{ - struct irq_desc *desc; - unsigned int irq; - unsigned long flags; - - irq = old_desc->irq_data.irq; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - - if (desc && old_desc != desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc " - "for migration.\n", irq); - /* still use old one */ - desc = old_desc; - goto out_unlock; - } - 
if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { - /* still use old one */ - kfree(desc); - desc = old_desc; - goto out_unlock; - } - - replace_irq_desc(irq, desc); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - /* free the old one */ - free_one_irq_desc(old_desc, desc); - kfree(old_desc); - - return desc; - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - /* those static or target node is -1, do not move them */ - if (desc->irq_data.irq < NR_IRQS_LEGACY || node == -1) - return desc; - - if (desc->irq_data.node != node) - desc = __real_move_irq_desc(desc, node); - - return desc; -} - -- cgit v1.2.2 From a05a900a51c7622ebd8ddb41f14f8bf9db599d8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 12:47:53 +0200 Subject: genirq: Make sparse_lock a mutex No callers from atomic regions. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4f0b9c9d5c46..0e302f90d2ee 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -90,7 +90,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -static DEFINE_RAW_SPINLOCK(sparse_irq_lock); +static DEFINE_MUTEX(sparse_irq_lock); static DECLARE_BITMAP(allocated_irqs, NR_IRQS); #ifdef CONFIG_SPARSE_IRQ @@ -159,13 +159,12 @@ err_desc: static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; unregister_irq_proc(irq, desc); - raw_spin_lock_irqsave(&sparse_irq_lock, flags); + mutex_lock(&sparse_irq_lock); delete_irq_desc(irq); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + mutex_unlock(&sparse_irq_lock); free_masks(desc); kfree(desc->kstat_irqs); 
@@ -175,16 +174,15 @@ static void free_desc(unsigned int irq) static int alloc_descs(unsigned int start, unsigned int cnt, int node) { struct irq_desc *desc; - unsigned long flags; int i; for (i = 0; i < cnt; i++) { desc = alloc_desc(start + i, node); if (!desc) goto err; - raw_spin_lock_irqsave(&sparse_irq_lock, flags); + mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + mutex_unlock(&sparse_irq_lock); } return start; @@ -192,9 +190,9 @@ err: for (i--; i >= 0; i--) free_desc(start + i); - raw_spin_lock_irqsave(&sparse_irq_lock, flags); + mutex_lock(&sparse_irq_lock); bitmap_clear(allocated_irqs, start, cnt); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + mutex_unlock(&sparse_irq_lock); return -ENOMEM; } @@ -291,7 +289,6 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) */ void irq_free_descs(unsigned int from, unsigned int cnt) { - unsigned long flags; int i; if (from >= nr_irqs || (from + cnt) > nr_irqs) @@ -300,9 +297,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt) for (i = 0; i < cnt; i++) free_desc(from + i); - raw_spin_lock_irqsave(&sparse_irq_lock, flags); + mutex_lock(&sparse_irq_lock); bitmap_clear(allocated_irqs, from, cnt); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + mutex_unlock(&sparse_irq_lock); } /** @@ -317,13 +314,12 @@ void irq_free_descs(unsigned int from, unsigned int cnt) int __ref irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) { - unsigned long flags; int start, ret; if (!cnt) return -EINVAL; - raw_spin_lock_irqsave(&sparse_irq_lock, flags); + mutex_lock(&sparse_irq_lock); start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); ret = -EEXIST; @@ -335,11 +331,11 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) goto err; bitmap_set(allocated_irqs, start, cnt); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + 
mutex_unlock(&sparse_irq_lock); return alloc_descs(start, cnt, node); err: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + mutex_unlock(&sparse_irq_lock); return ret; } @@ -352,20 +348,19 @@ err: */ int irq_reserve_irqs(unsigned int from, unsigned int cnt) { - unsigned long flags; unsigned int start; int ret = 0; if (!cnt || (from + cnt) > nr_irqs) return -EINVAL; - raw_spin_lock_irqsave(&sparse_irq_lock, flags); + mutex_lock(&sparse_irq_lock); start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); if (start == from) bitmap_set(allocated_irqs, start, cnt); else ret = -EEXIST; - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); + mutex_unlock(&sparse_irq_lock); return ret; } -- cgit v1.2.2 From baa0d233afe765daa6dc01ff233aea8c5944f534 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Oct 2010 15:14:35 +0200 Subject: genirq: Switch sparse_irq allocator to GFP_KERNEL The allocator functions are now called outside of preempt disabled regions. Switch to GFP_KERNEL. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/irq/irqdesc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0e302f90d2ee..f6f660cef7af 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -95,7 +95,7 @@ static DECLARE_BITMAP(allocated_irqs, NR_IRQS); #ifdef CONFIG_SPARSE_IRQ -static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); +static RADIX_TREE(irq_desc_tree, GFP_KERNEL); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { @@ -126,9 +126,8 @@ static inline void free_masks(struct irq_desc *desc) { } static struct irq_desc *alloc_desc(int irq, int node) { - /* Temporary hack until we can switch to GFP_KERNEL */ - gfp_t gfp = gfp_allowed_mask == GFP_BOOT_MASK ? 
GFP_NOWAIT : GFP_ATOMIC; struct irq_desc *desc; + gfp_t gfp = GFP_KERNEL; desc = kzalloc_node(sizeof(*desc), gfp, node); if (!desc) -- cgit v1.2.2 From 5b8c4f23c54edda640a4850bc9b81dee70acb525 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Sep 2010 14:33:43 +0000 Subject: printk: Make console_sem a semaphore not a pseudo mutex It needs to be investigated whether it can be replaced by a real mutex, but that needs more thought. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Christoph Hellwig LKML-Reference: <20100907125057.179587334@linutronix.de> --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..2531017795f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -556,7 +556,7 @@ static void zap_locks(void) /* If a crash is occurring, make sure we can't deadlock */ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) -- cgit v1.2.2 From c0a19ebc018222ffd1dd93af5b53d9efd779c19b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Oct 2010 21:58:27 +0200 Subject: genirq: Fix CONFIG_GENIRQ_NO_DEPRECATED=y build This option can be set to verify the full conversion to the new chip functions. Fix the fallout of the patch rework, so the core code compiles and works with it. 
Signed-off-by: Thomas Gleixner --- kernel/irq/dummychip.c | 2 +- kernel/irq/irqdesc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 918dea9de9ea..20dc5474947e 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,7 +31,7 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_CRUFT +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED static void compat_noop(unsigned int irq) { } #define END_INIT .end = compat_noop #else diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f6f660cef7af..9d917ff72675 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -118,7 +118,7 @@ static void free_masks(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif - free_cpumask_var(desc->affinity); + free_cpumask_var(desc->irq_data.affinity); } #else static inline void free_masks(struct irq_desc *desc) { } -- cgit v1.2.2 From fb62db2ba943b1683f1d7181bb2988fce4c60870 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 13 Oct 2010 11:02:34 -0700 Subject: futex: Fix kernel-doc notation & typos Convert futex_requeue() function parameters to use @name kernel-doc notation and add @fshared & @cmpval to prevent kernel-doc warnings. Add @list to struct futex_q. Fix a few typos. 
Signed-off-by: Randy Dunlap Acked-by: Rusty Russell LKML-Reference: <20101013110234.89b06043.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 92a31d4cd564..9b9fda73ba2e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -91,6 +91,7 @@ struct futex_pi_state { /** * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @key: the key the futex is hashed on @@ -104,7 +105,7 @@ struct futex_pi_state { * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakup is always to make the first condition true, then + * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via @@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * - * We have no generic implementation of a non destructive write to the + * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. @@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ pi_state = this->pi_state; /* - * Userspace might have messed up non PI and PI futexes + * Userspace might have messed up non-PI and PI futexes */ if (unlikely(!pi_state)) return -EINVAL; @@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q) /* * We set q->lock_ptr = NULL _before_ we wake up the task. 
If - * a non futex wake up happens on another CPU then the task - * might exit and p would dereference a non existing task + * a non-futex wake up happens on another CPU then the task + * might exit and p would dereference a non-existing task * struct. Prevent this by holding a reference on p across the * wake up. */ @@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * uaddr1: source futex user address - * uaddr2: target futex user address - * nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * nr_requeue: number of waiters to requeue (0-INT_MAX) - * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * @uaddr1: source futex user address + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire @@ -2651,7 +2654,7 @@ static int __init futex_init(void) * of the complex code paths. Also we want to prevent * registration of robust lists in that case. NULL is * guaranteed to fault and we get -EFAULT on functional - * implementation, the non functional ones will return + * implementation, the non-functional ones will return * -ENOSYS. */ curval = cmpxchg_futex_value_locked(NULL, 0, 0); -- cgit v1.2.2 From 864616ee6785d9fac7a2cd80c01a2da89579f2e4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 14 Oct 2010 16:09:13 +0900 Subject: sched: Comment updates: fix default latency and granularity numbers Targeted preemption latency and minimal preemption granularity for CPU-bound tasks have been changed. 
This patch updates the comments about these values. Signed-off-by: Takuya Yoshikawa Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20101014160913.eb24fef4.yoshikawa.takuya@oss.ntt.co.jp> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 623e9aceef8f..bf87192e97fe 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,7 +25,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -- cgit v1.2.2 From 907f27840985fe6a0c62e43cd4702c6e04b4bcc7 Mon Sep 17 00:00:00 2001 From: matt mooney Date: Mon, 27 Sep 2010 19:04:53 -0700 Subject: tracing/trivial: Remove cast from void* Unnecessary cast from void* in assignment. 
Signed-off-by: matt mooney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65fb077ea79c..ebd80d50c474 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..001bcd2ccf4a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; -- cgit v1.2.2 From a9d61173dc1cb63e660ae89e874e51ba4fd2f991 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 24 Sep 2010 17:41:02 +0200 Subject: tracing: Add proper check for irq_depth routines The check_irq_entry and check_irq_return could be called from graph event context. In such case there's no graph private data allocated. Adding checks to handle this case. 
Signed-off-by: Jiri Olsa LKML-Reference: <20100924154102.GB1818@jolsa.brq.redhat.com> [ Fixed some grammar in the comments ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ef49e9370b25..4c58ccc6427c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -888,12 +888,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags, unsigned long addr, int depth) { int cpu = iter->cpu; + int *depth_irq; struct fgraph_data *data = iter->private; - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - if (flags & TRACE_GRAPH_PRINT_IRQS) + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) return 0; + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + /* * We are inside the irq code */ @@ -926,12 +934,20 @@ static int check_irq_return(struct trace_iterator *iter, u32 flags, int depth) { int cpu = iter->cpu; + int *depth_irq; struct fgraph_data *data = iter->private; - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - if (flags & TRACE_GRAPH_PRINT_IRQS) + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) return 0; + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + /* * We are not inside the irq code. 
*/ -- cgit v1.2.2 From 0a772620a2e21fb55a02f70fe38d4b5c3a5fbbbf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Sep 2010 14:00:52 +0200 Subject: tracing: Make graph related irqs/preemptsoff functions global Move the trace_graph_function() and print_graph_headers_flags() functions to trace_functions_graph.c so that they are globally available. Signed-off-by: Jiri Olsa LKML-Reference: <1285243253-7372-3-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 +++ kernel/trace/trace_functions_graph.c | 63 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace_irqsoff.c | 56 ++++---------------------------- 3 files changed, 71 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4c58ccc6427c..6f8fe28acba1 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -262,6 +262,35 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + 
__trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, parent_ip, flags, pc); + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -1179,7 +1208,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1242,7 +1271,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1274,7 +1314,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1315,6 +1355,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + 
print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..4047e98afcba 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } -static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, 
flags, pc); -} - static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else -- cgit v1.2.2 From 7495a5beaa22f190f4888aa8cbe4827c16575d0a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Sep 2010 14:00:53 +0200 Subject: tracing: Graph support for wakeup tracer Add function graph support for wakeup latency tracer. The graph output is enabled by setting the 'display-graph' trace option. Signed-off-by: Jiri Olsa LKML-Reference: <1285243253-7372-4-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 231 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 221 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..033510dbb322 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,13 +31,33 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef 
CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * wakeup uses its own tracer function to keep the overhead down: */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) @@ -80,8 +100,191 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; + +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, pc, ret = 0; + + if (likely(!wakeup_task)) + return 0; + + pc = preempt_count(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + +out: + atomic_dec(&data->disabled); + +out_enable: + preempt_enable_notrace(); + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu 
*data; + unsigned long flags; + long disabled; + int cpu, pc; + + if (likely(!wakeup_task)) + return; + + pc = preempt_count(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + +out: + atomic_dec(&data->disabled); + +out_enable: + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct 
seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -152,7 +355,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -252,7 +455,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -303,12 +506,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -320,7 +519,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -379,9 +578,15 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST 
.selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; @@ -394,9 +599,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; -- cgit v1.2.2 From 542181d3769d001c59cd17573dd4381e87d215f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Oct 2010 16:38:49 -0400 Subject: tracing: Use one prologue for the wakeup tracer function tracers The wakeup tracer has three types of function tracers. Normal function tracer, function graph entry, and function graph return. Each of these use a complex dance to prevent recursion and whether to trace the data or not (depending on the wake_task variable). This patch moves the duplicate code into a single routine, to prevent future mistakes with modifying duplicate complex code. Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 102 +++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 033510dbb322..31689d2df7f3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -56,43 +56,73 @@ static struct tracer_flags tracer_flags = { #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) #ifdef CONFIG_FUNCTION_TRACER + /* - * wakeup uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. 
+ * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); + *pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} + +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: preempt_enable_notrace(); } @@ -154,32 +184,16 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu, pc, ret = 0; + int pc, ret = 0; - if (likely(!wakeup_task)) + if 
(!func_prolog_preempt_disable(tr, &data, &pc)) return 0; - pc = preempt_count(); - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - if (cpu != wakeup_current_cpu) - goto out_enable; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1)) - goto out; - local_save_flags(flags); ret = __trace_graph_entry(tr, trace, flags, pc); - -out: atomic_dec(&data->disabled); - -out_enable: preempt_enable_notrace(); + return ret; } @@ -188,31 +202,15 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu, pc; + int pc; - if (likely(!wakeup_task)) + if (!func_prolog_preempt_disable(tr, &data, &pc)) return; - pc = preempt_count(); - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - if (cpu != wakeup_current_cpu) - goto out_enable; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1)) - goto out; - local_save_flags(flags); __trace_graph_return(tr, trace, flags, pc); - -out: atomic_dec(&data->disabled); -out_enable: preempt_enable_notrace(); return; } -- cgit v1.2.2 From 5e6d2b9cfa3a6e7fe62fc0135bc1bd778f5db564 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Oct 2010 19:41:43 -0400 Subject: tracing: Use one prologue for the preempt irqs off tracer function tracers The preempt and irqsoff tracers have three types of function tracers. Normal function tracer, function graph entry, and function graph return. Each of these use a complex dance to prevent recursion and whether to trace the data or not (depending if interrupts are enabled or not). This patch moves the duplicate code into a single routine, to prevent future mistakes with modifying duplicate complex code. 
Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 96 ++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 4047e98afcba..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + 
+/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) - return 0; - - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) - return; - - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) + if (!func_prolog_dec(tr, &data, &flags)) return; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } -- cgit v1.2.2 From 
78c89ba121221d9224a5747803d7fffe51cd6e44 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Oct 2010 23:22:19 -0400 Subject: tracing: Remove parent recording in latency tracer graph options Even though the parent is recorded with the normal function tracing of the latency tracers (irqsoff and wakeup), the function graph recording is bogus. This is due to the function graph messing with the return stack. The latency tracers pass in as the parent CALLER_ADDR0, which works fine for plain function tracing. But this causes bogus output with the graph tracer: 3) -0 | d.s3. 0.000 us | return_to_handler(); 3) -0 | d.s3. 0.000 us | _raw_spin_unlock_irqrestore(); 3) -0 | d.s3. 0.000 us | return_to_handler(); 3) -0 | d.s3. 0.000 us | trace_hardirqs_on(); The "return_to_handle()" call is the trampoline of the function graph tracer, and is meaningless in this context. Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f8fe28acba1..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -287,7 +287,6 @@ trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - __trace_graph_function(tr, parent_ip, flags, pc); __trace_graph_function(tr, ip, flags, pc); } -- cgit v1.2.2 From 4924627423d5e286136ad2520f5be536345ae590 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Oct 2010 21:46:10 +0200 Subject: sched: Unindent labels Labels should be on column 0. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++------ kernel/sched_rt.c | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2111491f6424..7f522832250c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4891,7 +4891,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again: retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { @@ -8141,9 +8141,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(cfs_rq); - err: +err: return 0; } @@ -8231,9 +8231,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(rt_rq); - err: +err: return 0; } @@ -8591,7 +8591,7 @@ static int tg_set_bandwidth(struct task_group *tg, raw_spin_unlock(&rt_rq->rt_runtime_lock); } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index baef30f08405..ab77aa00b7b1 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1140,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) for_each_leaf_rt_rq(rt_rq, rq) { array = &rt_rq->active; idx = sched_find_first_bit(array->bitmap); - next_idx: +next_idx: if (idx >= MAX_RT_PRIO) continue; if (next && next->prio < idx) @@ -1316,7 +1316,7 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; - retry: +retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); return 0; @@ -1464,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq) * but possible) */ } - skip: +skip: double_unlock_balance(this_rq, src_rq); } -- cgit v1.2.2 From 34f971f6f7988be4d014eec3e3526bee6d007ffa Mon Sep 
17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Sep 2010 13:53:15 +0200 Subject: sched: Create special class for stop/migrate work In order to separate the stop/migrate work thread from the SCHED_FIFO implementation, create a special class for it that is of higher priority than SCHED_FIFO itself. This currently solves a problem where cpu-hotplug consumes so much cpu-time that the SCHED_FIFO class gets throttled, but has the bandwidth replenishment timer pending on the now dead cpu. It is also required for when we add the planned deadline scheduling class above SCHED_FIFO, as the stop/migrate thread still needs to transcend those tasks. Tested-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <1285165776.2275.1022.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 54 ++++++++++++++++++++---- kernel/sched_stoptask.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/stop_machine.c | 8 ++-- 3 files changed, 158 insertions(+), 12 deletions(-) create mode 100644 kernel/sched_stoptask.c (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7f522832250c..5f64fed56a44 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -486,7 +486,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; @@ -1837,7 +1837,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1917,10 +1917,41 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void
sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -3720,17 +3751,13 @@ pick_next_task(struct rq *rq) return p; } - class = sched_class_highest; - for ( ; ; ) { + for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; } + + BUG(); /* the idle class will always have a runnable task */ } /* @@ -4659,6 +4686,15 @@ recheck: */ rq = __task_rq_lock(p); + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EINVAL; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 000000000000..45bddc0c1048 --- /dev/null +++ b/kernel/sched_stoptask.c @@ -0,0 +1,108 @@ +/* + * stop-task scheduling class. + * + * The stop task is the highest priority task in the system, it preempts + * everything and will be preempted by nothing.
+ * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct rq *rq, struct task_struct *p, + int sd_flag, int flags) +{ + return task_cpu(p); /* stop tasks as never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ + resched_task(rq->curr); /* we preempt everything */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + if (stop && stop->state == TASK_RUNNING) + return stop; + + return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void yield_task_stop(struct rq *rq) +{ + BUG(); /* the stop task should never yield, its pointless. */ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p, + int running) +{ + BUG(); /* its impossible to change to this class */ +} + +static void prio_changed_stop(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + BUG(); /* how!?, what priority? 
*/ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +static const struct sched_class stop_sched_class = { + .next = &rt_sched_class, + + .enqueue_task = enqueue_task_stop, + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + + .check_preempt_curr = check_preempt_curr_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_stop, +#endif + + .set_curr_task = set_curr_task_stop, + .task_tick = task_tick_stop, + + .get_rr_interval = get_rr_interval_stop, + + .prio_changed = prio_changed_stop, + .switched_to = switched_to_stop, + + /* no .task_new for stop tasks */ +}; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4372ccb25127..090c28812ce1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -287,11 +287,12 @@ repeat: goto repeat; } +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct task_struct *p; @@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, cpu); if (IS_ERR(p)) return NOTIFY_BAD; - sched_setscheduler_nocheck(p, SCHED_FIFO, &param); get_task_struct(p); + kthread_bind(p, cpu); + sched_set_stop_task(cpu, p); stopper->thread = p; break; case CPU_ONLINE: - kthread_bind(stopper->thread, cpu); /* strictly unnecessary, as first user will wake it */ wake_up_process(stopper->thread); /* mark enabled */ @@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, {
struct cpu_stop_work *work; + sched_set_stop_task(cpu, NULL); /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ -- cgit v1.2.2 From 17bdcf949d03306b308c5fb694849cd35f119807 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 11 Oct 2010 16:36:51 +0200 Subject: sched: Drop all load weight manipulation for RT tasks Load weights are for the CFS, they do not belong in the RT task. This makes all RT scheduling classes leave the CFS weights alone. This fixes a real bug as well: I noticed the following phenomenon: a process elevated to SCHED_RR forks with SCHED_RESET_ON_FORK set, and the child is indeed SCHED_OTHER, and the niceval is indeed reset to 0. However the weight inserted by set_load_weight() remains at 0, giving the task insignificant priority. With this fix, the weight is reset to what the task had before being elevated to SCHED_RR/SCHED_FIFO. Cc: Lennart Poettering Cc: stable@kernel.org Signed-off-by: Linus Walleij Signed-off-by: Peter Zijlstra LKML-Reference: <1286807811-10568-1-git-send-email-linus.walleij@stericsson.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5f64fed56a44..728081a7ef1c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1855,12 +1855,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ -- cgit v1.2.2 From 620162505e5d46bc4494b1761743e4b0b3bf8e16 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Tue, 5 Oct 2010 18:01:51 +0900 Subject: lockdep: Add improved subclass caching Current lockdep_map only caches one class with subclass == 0, and looks up hash table of classes when subclass != 0. It seems that this has no problem because the case of subclass != 0 is rare.
But locks of struct rq are acquired with subclass == 1 when task migration is executed. Task migration is a highly frequent event, so I modified lockdep to cache subclasses. I measured the score of perf bench sched messaging. This patch has a slight but certain (order of milliseconds or 10 milliseconds) effect when lots of tasks are running. I'll show the result in the tail of this description. NR_LOCKDEP_CACHING_CLASSES specifies how many classes can be cached in the instances of lockdep_map. I discussed with Peter Zijlstra in LinuxCon Japan about this approach and he taught me that caching every subclass (8) is clearly a waste of memory. So the number of cached classes should be configurable. === Score comparison of benchmarks === # "min" means best score, and "max" means worst score for i in `seq 1 10`; do ./perf bench -f simple sched messaging; done before: min: 0.565000, max: 0.583000, avg: 0.572500 after: min: 0.559000, max: 0.568000, avg: 0.563300 # with more processes for i in `seq 1 10`; do ./perf bench -f simple sched messaging -g 40; done before: min: 2.274000, max: 2.298000, avg: 2.286300 after: min: 2.242000, max: 2.270000, avg: 2.259700 Signed-off-by: Hitoshi Mitake Cc: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1286269311-28336-2-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 84baa71cfda5..bc4d32871f9a 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -774,7 +774,9 @@ out_unlock_set: raw_local_irq_restore(flags); if (!subclass || force) - lock->class_cache = class; + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -2679,7 +2681,11 @@ static int mark_lock(struct task_struct *curr, struct
held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - lock->class_cache = NULL; + int i; + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif @@ -2750,10 +2756,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (lock->key == &__lockdep_no_validate__) check = 1; - if (!subclass) - class = lock->class_cache; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; /* - * Not cached yet or subclass? + * Not cached? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); @@ -2918,7 +2924,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache; + struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3559,7 +3565,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; -- cgit v1.2.2 From 4ba053c04aece1f4734056f21b751eee47ea3fb1 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Wed, 13 Oct 2010 17:30:26 +0900 Subject: lockdep: Check the depth of subclass Current look_up_lock_class() doesn't check the parameter "subclass". This rarely rises problems because the main caller of this function, register_lock_class(), checks it. But register_lock_class() is not the only function which calls look_up_lock_class(). lock_set_class() and its callees also call it. And lock_set_class() doesn't check this parameter. 
This will raise problems when the value of subclass is larger than MAX_LOCKDEP_SUBCLASSES, because the address (used as the key of class) calculated with too large a subclass has a probability to point to another key in a different lock_class_key. Of course this problem depends on the memory layout and occurs with really low probability. Signed-off-by: Hitoshi Mitake Cc: Dmitry Torokhov Cc: Vojtech Pavlik Cc: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1286958626-986-1-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index bc4d32871f9a..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } #endif + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: @@ -2745,14 +2755,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - if (lock->key == &__lockdep_no_validate__) check = 1; -- cgit v1.2.2 From ef8002f6848236de5adc613063ebeabddea8a6fb Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 13 Oct 2010 12:09:35 -0700 Subject: sched: Do not consider SCHED_IDLE tasks to be cache hot This patch adds a check in task_hot to return if the task has SCHED_IDLE policy.
SCHED_IDLE tasks have very low weight, and when run with regular workloads, are typically scheduled many milliseconds apart. There is no need to consider these tasks hot for load balancing. Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1287173550-30365-2-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 728081a7ef1c..771b518e5f1f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2025,6 +2025,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + /* * Buddy candidates are cache hot: */ -- cgit v1.2.2 From 2582f0eba54066b5e98ff2b27ef0cfa833b59f54 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 13 Oct 2010 12:09:36 -0700 Subject: sched: Set group_imb only a task can be pulled from the busiest cpu When cycling through sched groups to determine the busiest group, set group_imb only if the busiest cpu has more than 1 runnable task. This patch fixes the case where two cpus in a group have one runnable task each, but there is a large weight differential between these two tasks. The load balancer is unable to migrate any task from this group, and hence do not consider this group to be imbalanced. 
Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1286996978-7007-3-git-send-email-ncrao@google.com> [ small code readability edits ] Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bf87192e97fe..3656480e0f79 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2378,7 +2378,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load; + unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; @@ -2389,6 +2389,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; + max_nr_running = 0; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); @@ -2406,8 +2407,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) + if (load > max_cpu_load) { max_cpu_load = load; + max_nr_running = rq->nr_running; + } if (min_cpu_load > load) min_cpu_load = load; } @@ -2447,11 +2450,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = - DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, 
group); } -- cgit v1.2.2 From fab476228ba37907ad75216d0fd9732ada9c119e Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Fri, 15 Oct 2010 13:12:29 -0700 Subject: sched: Force balancing on newidle balance if local group has capacity This patch forces a load balance on a newly idle cpu when the local group has extra capacity and the busiest group does not have any. It improves system utilization when balancing tasks with a large weight differential. Under certain situations, such as a niced down task (i.e. nice = -15) in the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and kicks away other tasks because of its large weight. This leads to sub-optimal utilization of the machine. Even though the sched group has capacity, it does not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL. With this patch, if the local group has extra capacity, we shortcut the checks in f_b_g() and try to pull a task over. A sched group has extra capacity if the group capacity is greater than the number of running tasks in that group. Thanks to Mike Galbraith for discussions leading to this patch and for the insight to reuse SD_NEWIDLE_BALANCE. 
Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1287173550-30365-4-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3656480e0f79..032b548be0fc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); + + /* re-arm NEWIDLE balancing when moving tasks */ + src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; + this_rq->idle_stamp = 0; } /* @@ -2030,12 +2034,14 @@ struct sd_lb_stats { unsigned long this_load; unsigned long this_load_per_task; unsigned long this_nr_running; + unsigned long this_has_capacity; /* Statistics of the busiest group */ unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2058,6 +2064,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? 
*/ }; /** @@ -2456,6 +2463,9 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; } /** @@ -2554,12 +2564,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; + sds->this_has_capacity = sgs.group_has_capacity; } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; sds->group_imb = sgs.group_imb; } @@ -2756,6 +2768,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } + /******* find_busiest_group() helpers end here *********************/ /** @@ -2807,6 +2820,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. + * + * Note: when doing newidle balance, if the local group has excess + * capacity (i.e. nr_running < group_capacity) and the busiest group + * does not have any capacity, we force a load balance to pull tasks + * to the local group. In this case, we skip past checks 3, 4 and 5. 
*/ if (!(*balance)) goto ret; @@ -2818,6 +2836,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + if (sds.this_load >= sds.max_load) goto out_balanced; @@ -2829,6 +2852,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; +force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -3162,10 +3186,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } raw_spin_lock(&this_rq->lock); -- cgit v1.2.2 From 75dd321d79d495a0ee579e6249ebc38ddbb2667f Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Fri, 15 Oct 2010 13:12:30 -0700 Subject: sched: Drop group_capacity to 1 only if local group has extra capacity When SD_PREFER_SIBLING is set on a sched domain, drop group_capacity to 1 only if the local group has extra capacity. The extra check prevents the case where you always pull from the heaviest group when it is already under-utilized (possible with a large weight task outweighs the tasks on the system). For example, consider a 16-cpu quad-core quad-socket machine with MC and NUMA scheduling domains. Let's say we spawn 15 nice0 tasks and one nice-15 task, and each task is running on one core. In this case, we observe the following events when balancing at the NUMA domain: - find_busiest_group() will always pick the sched group containing the niced task to be the busiest group. 
- find_busiest_queue() will then always pick one of the cpus running the nice0 task (never picks the cpu with the nice -15 task since weighted_cpuload > imbalance). - The load balancer fails to migrate the task since it is the running task and increments sd->nr_balance_failed. - It repeats the above steps a few more times until sd->nr_balance_failed > 5, at which point it kicks off the active load balancer, wakes up the migration thread and kicks the nice 0 task off the cpu. The load balancer doesn't stop until we kick out all nice 0 tasks from the sched group, leaving you with 3 idle cpus and one cpu running the nice -15 task. When balancing at the NUMA domain, we drop sgs.group_capacity to 1 if the child domain (in this case MC) has SD_PREFER_SIBLING set. Subsequent load checks are not relevant because the niced task has a very large weight. In this patch, we add an extra condition to the "if(prefer_sibling)" check in update_sd_lb_stats(). We drop the capacity of a group only if the local group has extra capacity, ie. nr_running < group_capacity. This patch preserves the original intent of the prefer_siblings check (to spread tasks across the system in low utilization scenarios) and fixes the case above. It helps in the following ways: - In low utilization cases (where nr_tasks << nr_cpus), we still drop group_capacity down to 1 if we prefer siblings. - On very busy systems (where nr_tasks >> nr_cpus), sgs.nr_running will most likely be > sgs.group_capacity. - When balancing large weight tasks, if the local group does not have extra capacity, we do not pick the group with the niced task as the busiest group. This prevents failed balances, active migration and the under-utilization described above. 
Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1287173550-30365-5-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 032b548be0fc..f1c615ff39d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2554,9 +2554,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try - * and move all the excess tasks away. + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling) + if (prefer_sibling && !local_group && sds->this_has_capacity) sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { -- cgit v1.2.2 From 75e1056f5c57050415b64cb761a3acc35d91f013 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:16 -0700 Subject: sched: Fix softirq time accounting Peter Zijlstra found a bug in the way softirq time is accounted in VIRT_CPU_ACCOUNTING on this thread: http://lkml.indiana.edu/hypermail//linux/kernel/1009.2/01366.html The problem is, softirq processing uses local_bh_disable internally. There is no way, later in the flow, to differentiate between whether softirq is being processed or is it just that bh has been disabled. So, a hardirq when bh is disabled results in time being wrongly accounted as softirq. Looking at the code a bit more, the problem exists in !VIRT_CPU_ACCOUNTING as well. 
As account_system_time() in normal tick based accounting also uses softirq_count, which will be set even when not in softirq with bh disabled. Peter also suggested solution of using 2*SOFTIRQ_OFFSET as irq count for local_bh_{disable,enable} and using just SOFTIRQ_OFFSET while softirq processing. The patch below does that and adds API in_serving_softirq() which returns whether we are currently processing softirq or not. Also changes one of the usages of softirq_count in net/sched/cls_cgroup.c to in_serving_softirq. Looks like many usages of in_softirq really want in_serving_softirq. Those changes can be made individually on a case by case basis. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-2-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/softirq.c | 51 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 771b518e5f1f..089be8adb074 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3422,7 +3422,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (in_serving_softirq()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..988dfbe6bbe8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -76,12 +76,22 @@ void wakeup_softirqd(void) wake_up_process(tsk); } +/* + * preempt_count and SOFTIRQ_OFFSET usage: + * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving + * softirq processing. + * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ + /* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip) +static void __local_bh_disable(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += SOFTIRQ_OFFSET; + preempt_count() += cnt; /* * Were softirqs turned off above: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == SOFTIRQ_OFFSET) + if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip) +static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - add_preempt_count(SOFTIRQ_OFFSET); + add_preempt_count(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); +static void __local_bh_enable(unsigned int cnt) +{ + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == cnt) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(cnt); +} + /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); - WARN_ON_ONCE(!irqs_disabled()); - - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on((unsigned 
long)__builtin_return_address(0)); - sub_preempt_count(SOFTIRQ_OFFSET); + __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); @@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) /* * Are softirqs going to be turned on now: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) trace_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); + sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); @@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -245,7 +262,7 @@ restart: lockdep_softirq_exit(); account_system_vtime(current); - _local_bh_enable(); + __local_bh_enable(SOFTIRQ_OFFSET); } #ifndef __ARCH_HAS_DO_SOFTIRQ -- cgit v1.2.2 From 6cdd5199daf0cb7b0fcc8dca941af08492612887 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:18 -0700 Subject: sched: Add a PF flag for ksoftirqd identification To account softirq time cleanly in scheduler, we need to identify whether softirq is invoked in ksoftirqd context or softirq at hardirq tail context. Add PF_KSOFTIRQD for that purpose. As all PF flag bits are currently taken, create space by moving one of the infrequently used bits (PF_THREAD_BOUND) down in task_struct to be along with some other state fields. 
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-4-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 988dfbe6bbe8..267f7b763ebb 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -713,6 +713,7 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); + current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { -- cgit v1.2.2 From b52bfee445d315549d41eacf2fa7c156e7d153d5 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:19 -0700 Subject: sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time s390/powerpc/ia64 have support for CONFIG_VIRT_CPU_ACCOUNTING which does the fine granularity accounting of user, system, hardirq, softirq times. Adding that option on archs like x86 will be challenging however, given the state of TSC reliability on various platforms and also the overhead it will add in syscall entry exit. Instead, add a lighter variant that only does finer accounting of hardirq and softirq times, providing precise irq times (instead of timer tick based samples). This accounting is added with a new config option CONFIG_IRQ_TIME_ACCOUNTING so that there won't be any overhead for users not interested in paying the perf penalty. This accounting is based on sched_clock, with the code being generic. So, other archs may find it useful as well. This patch just adds the core logic and does not enable this logic yet. 
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-5-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 089be8adb074..9b302e355791 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1908,6 +1908,55 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + now = sched_clock(); + cpu = smp_processor_id(); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. 
+ */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} + +#endif + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" -- cgit v1.2.2 From 305e6835e05513406fa12820e40e4a8ecb63743c Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:21 -0700 Subject: sched: Do not account irq time to current task Scheduler accounts both softirq and interrupt processing times to the currently running task. This means, if the interrupt processing was for some other task in the system, then the current task ends up being penalized as it gets shorter runtime than otherwise. Change sched task accounting to account only actual task time from currently running task. Now update_curr(), modifies the delta_exec to depend on rq->clock_task. Note that this change only handles CONFIG_IRQ_TIME_ACCOUNTING case. We can extend this to CONFIG_VIRT_CPU_ACCOUNTING with minimal effort. But, that's for later. This change will impact scheduling behavior in interrupt heavy conditions. Tested on a 4-way system with eth0 handled by CPU 2 and a network heavy task (nc) running on CPU 3 (and no RSS/RFS). With that I have CPU 2 spending 75%+ of its time in irq processing. CPU 3 spending around 35% time running nc task. Now, if I run another CPU intensive task on CPU 2, without this change /proc/<pid>/schedstat shows 100% of time accounted to this task. With this change, it rightly shows less than 25% accounted to this task as remaining time is actually spent on irq processing.
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-7-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 ++++++++++++++++++++++++++++++++++++++++--- kernel/sched_fair.c | 6 +++--- kernel/sched_rt.c | 8 ++++---- 3 files changed, 47 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9b302e355791..9e01b7100ef6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -491,6 +491,7 @@ struct rq { struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -641,10 +642,19 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ +static u64 irq_time_cpu(int cpu); + inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) { + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; + } } /* @@ -1910,6 +1920,18 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. 
+ */ static DEFINE_PER_CPU(u64, cpu_hardirq_time); static DEFINE_PER_CPU(u64, cpu_softirq_time); @@ -1926,6 +1948,14 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + void account_system_vtime(struct task_struct *curr) { unsigned long flags; @@ -1955,6 +1985,13 @@ void account_system_vtime(struct task_struct *curr) local_irq_restore(flags); } +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} + #endif #include "sched_idletask.c" @@ -3322,7 +3359,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock - p->se.exec_start; + ns = rq->clock_task - p->se.exec_start; if ((s64)ns < 0) ns = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f1c615ff39d6..c358d4081b81 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + u64 now = rq_of(cfs_rq)->clock_task; unsigned long delta_exec; if (unlikely(!curr)) @@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock; + se->exec_start = rq_of(cfs_rq)->clock_task; } /************************************************** @@ -1802,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. 
*/ - tsk_cache_hot = task_hot(p, rq->clock, sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ab77aa00b7b1..bea7d79f7e9c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq) if (!task_has_rt_policy(curr)) return; - delta_exec = rq->clock - curr->se.exec_start; + delta_exec = rq->clock_task - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock; + curr->se.exec_start = rq->clock_task; cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1075,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; return p; } @@ -1713,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); -- cgit v1.2.2 From aa483808516ca5cacfa0e5849691f64fec25828e Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:22 -0700 Subject: sched: Remove irq time from available CPU power The idea was suggested by Peter Zijlstra here: http://marc.info/?l=linux-kernel&m=127476934517534&w=2 irq time is technically not available to the tasks running on the CPU. This patch removes irq time from CPU power piggybacking on sched_rt_avg_update(). Tested this by keeping CPU X busy with a network intensive task having 75% of a single CPU irq processing (hard+soft) on a 4-way system. And start seven cycle soakers on the system.
Without this change, there will be two tasks on each CPU. With this change, there is a single task on irq busy CPU X and remaining 7 tasks are spread around among other 3 CPUs. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-8-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 ++++++++++++++++++ kernel/sched_fair.c | 8 +++++++- kernel/sched_features.h | 5 +++++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9e01b7100ef6..bff9ef537df0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -519,6 +519,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -643,6 +647,7 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ static u64 irq_time_cpu(int cpu); +static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); inline void update_rq_clock(struct rq *rq) { @@ -654,6 +659,8 @@ inline void update_rq_clock(struct rq *rq) irq_time = irq_time_cpu(cpu); if (rq->clock - irq_time > rq->clock_task) rq->clock_task = rq->clock - irq_time; + + sched_irq_time_avg_update(rq, irq_time); } } @@ -1985,6 +1992,15 @@ void account_system_vtime(struct task_struct *curr) local_irq_restore(flags); } +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +{ + if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { + u64 delta_irq = curr_irq_time - rq->prev_irq_time; + rq->prev_irq_time = curr_irq_time; + sched_rt_avg_update(rq, delta_irq); + } +} + #else static u64 irq_time_cpu(int cpu) @@ -1992,6 +2008,8 @@ static u64 irq_time_cpu(int cpu) return 0; } +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } + #endif #include "sched_idletask.c" diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 
c358d4081b81..74cccfae87a8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2275,7 +2275,13 @@ unsigned long scale_rt_power(int cpu) u64 total, available; total = sched_avg_period() + (rq->clock - rq->age_stamp); - available = total - rq->rt_avg; + + if (unlikely(total < rq->rt_avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - rq->rt_avg; + } if (unlikely((s64)total < SCHED_LOAD_SCALE)) total = SCHED_LOAD_SCALE; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8ad3ee..185f920ec1a2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) * release the lock. Decreases scheduling overhead. */ SCHED_FEAT(OWNER_SPIN, 1) + +/* + * Decrement CPU power based on irq activity + */ +SCHED_FEAT(NONIRQ_POWER, 1) -- cgit v1.2.2 From d267f87fb8179c6dba03d08b91952e81bc3723c7 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:23 -0700 Subject: sched: Call tick_check_idle before __irq_enter When CPU is idle and on first interrupt, irq_enter calls tick_check_idle() to notify interruption from idle. But, there is a problem if this call is done after __irq_enter, as all routines in __irq_enter may find stale time due to yet to be done tick_check_idle. Specifically, trace calls in __irq_enter when they use global clock and also account_system_vtime change in this patch as it wants to use sched_clock_cpu() to do proper irq timing. But, tick_check_idle was moved after __irq_enter intentionally to prevent problem of unneeded ksoftirqd wakeups by the commit ee5f80a: irq: call __irq_enter() before calling the tick_idle_check Impact: avoid spurious ksoftirqd wakeups Moving tick_check_idle() before __irq_enter and wrapping it with local_bh_enable/disable would solve both the problems. 
Fixed-by: Yong Zhang Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-9-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/softirq.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bff9ef537df0..567f5cb9808c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1974,8 +1974,8 @@ void account_system_vtime(struct task_struct *curr) local_irq_save(flags); - now = sched_clock(); cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); delta = now - per_cpu(irq_start_time, cpu); per_cpu(irq_start_time, cpu) = now; /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 267f7b763ebb..79ee8f1fc0e7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -296,10 +296,16 @@ void irq_enter(void) rcu_irq_enter(); if (idle_cpu(cpu) && !in_interrupt()) { - __irq_enter(); + /* + * Prevent raise_softirq from needlessly waking up ksoftirqd + * here, as softirq will be serviced on return from interrupt. + */ + local_bh_disable(); tick_check_idle(cpu); - } else - __irq_enter(); + _local_bh_enable(); + } + + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -- cgit v1.2.2 From b7dadc38797584f6203386da1947ed5edf516646 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2010 20:00:37 +0200 Subject: sched: Export account_system_vtime() KVM uses it for example: ERROR: "account_system_vtime" [arch/x86/kvm/kvm.ko] undefined! 
Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <1286237003-12406-3-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 567f5cb9808c..5998222f901c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1991,6 +1991,7 @@ void account_system_vtime(struct task_struct *curr) local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(account_system_vtime); static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { -- cgit v1.2.2 From 7ada876a8703f23befbb20a7465a702ee39b1704 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 17 Oct 2010 08:35:04 -0700 Subject: futex: Fix errors in nested key ref-counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit futex_wait() is leaking key references due to futex_wait_setup() acquiring an additional reference via the queue_lock() routine. The nested key ref-counting has been masking bugs and complicating code analysis. queue_lock() is only called with a previously ref-counted key, so remove the additional ref-counting from the queue_(un)lock() functions. Also futex_wait_requeue_pi() drops one key reference too many in unqueue_me_pi(). Remove the key reference handling from unqueue_me_pi(). This was paired with a queue_lock() in futex_lock_pi(), so the count remains unchanged. Document remaining nested key ref-counting sites. 
Signed-off-by: Darren Hart Reported-and-tested-by: Matthieu Fertré Reported-by: Louis Rilling Cc: Peter Zijlstra Cc: Eric Dumazet Cc: John Kacur Cc: Rusty Russell LKML-Reference: <4CBB17A8.70401@linux.intel.com> Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/futex.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..e328f574c97c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1363,7 +1363,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1375,7 +1374,6 @@ static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); } /** @@ -1480,8 +1478,6 @@ static void unqueue_me_pi(struct futex_q *q) q->pi_state = NULL; spin_unlock(q->lock_ptr); - - drop_futex_key_refs(&q->key); } /* @@ -1812,7 +1808,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, } retry: - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out; @@ -1822,24 +1821,23 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; + /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) - goto out_put_key; + goto out; ret = -ETIMEDOUT; if (to && !to->task) - goto out_put_key; + goto out; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. 
*/ - if (!signal_pending(current)) { - put_futex_key(fshared, &q.key); + if (!signal_pending(current)) goto retry; - } ret = -ERESTARTSYS; if (!abs_time) - goto out_put_key; + goto out; restart = &current_thread_info()->restart_block; restart->fn = futex_wait_restart; @@ -1856,8 +1854,6 @@ retry: ret = -ERESTART_RESTARTBLOCK; -out_put_key: - put_futex_key(fshared, &q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2236,7 +2232,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out_key2; @@ -2254,7 +2253,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * In order for us to be here, we know our q.key == key2, and since * we took the hb->lock above, we also know that futex_requeue() has * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquition by the requeue code. + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. */ /* Check if the requeue code acquired the second futex for us. */ -- cgit v1.2.2 From 7e40798f406fe73f9bac496a390daabd8768a8f7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 10:56:19 -0400 Subject: tracing: Fix compile issue for trace_sched_wakeup.c The function start_func_tracer() was incorrectly added in the #ifdef CONFIG_FUNCTION_TRACER condition, but is still used even when function tracing is not enabled. The calls to register_ftrace_function() and register_ftrace_graph() become nops (and their arguments are even ignored), thus there is no reason to hide start_func_tracer() when function tracing is not enabled.
Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 31689d2df7f3..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,6 +130,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; +#endif /* CONFIG_FUNCTION_TRACER */ static int start_func_tracer(int graph) { @@ -159,8 +160,6 @@ static void stop_func_tracer(int graph) unregister_ftrace_graph(); } -#endif /* CONFIG_FUNCTION_TRACER */ - #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_set_flag(u32 old_flags, u32 bit, int set) { -- cgit v1.2.2 From b0ae19811375031ae3b3fecc65b702a9c6e5cc28 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Oct 2010 04:21:18 +0900 Subject: security: remove unused parameter from security_task_setscheduler() No security module should change the sched_param parameter of security_task_setscheduler(). Doing so is not only meaningless, but can also produce a harmful result if the caller passes a static variable. This patch removes the policy and sched_param parameters from security_task_setscheduler() because none of the security modules is using them.
Cc: James Morris Signed-off-by: KOSAKI Motohiro Signed-off-by: James Morris --- kernel/cpuset.c | 4 ++-- kernel/sched.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk); if (ret) return ret; if (threadgroup) { @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); + ret = security_task_setscheduler(c); if (ret) { rcu_read_unlock(); return ret; diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..df6579d9b4df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4645,7 +4645,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4887,7 +4887,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; -- cgit v1.2.2