From aa2bf9bc6414b6972b9e51903c1ce7b1f057aee2 Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Wed, 21 Mar 2012 20:10:54 +0530 Subject: itimer: Schedule silent NULL pointer fixup in setitimer() for removal setitimer() should return -EFAULT if called with an invalid pointer for value. The current code excludes a NULL pointer from this rule and silently uses it to stop the timer. This violates the spec. Warn about user space apps which rely on that feature and schedule it for removal. [ tglx: Massaged changelog, warn message and Doc entry ] Signed-off-by: Sasikantha babu Link: http://lkml.kernel.org/r/1332340854-26053-1-git-send-email-sasikanth.v19@gmail.com Signed-off-by: Thomas Gleixner --- kernel/itimer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 22000c3db0dd..c70369a74b5a 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -284,8 +284,11 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (value) { if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) return -EFAULT; - } else + } else { memset((char *) &set_buffer, 0, sizeof(set_buffer)); + WARN_ONCE(1, "setitimer: new_value pointer is NULL." + " Misfeature support will be removed\n"); + } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); if (error || !ovalue) -- cgit v1.2.2 From 3872c48b14259d8c0a00c9fff06a4a4123f7f4eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 31 Mar 2012 12:45:43 +0200 Subject: tick: Document TICK_ONESHOT config option This option has been selected from arch code as it was assumed that it's necessary to support oneshot mode clockevent devices. But it's just a core internal helper to compile tick-oneshot.c if NOHZ or HIGH_RES_TIMERS are selected. Reported-by: Russell King Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2cf9cc7aa103..a20dc8a3c949 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,6 +1,10 @@ # # Timer subsystem related configuration options # + +# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this. config TICK_ONESHOT bool -- cgit v1.2.2 From 6f103929f8979d2638e58d7f7fda0beefcb8ee7e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 27 Mar 2012 15:09:37 -0400 Subject: nohz: Fix stale jiffies update in tick_nohz_restart() Fix tick_nohz_restart() to not use a stale ktime_t "now" value when calling tick_do_update_jiffies64(now). If we reach this point in the loop it means that we crossed a tick boundary since we grabbed the "now" timestamp, so at this point "now" refers to a time in the old jiffy, so using the old value for "now" is incorrect, and is likely to give us a stale jiffies value. In particular, the first time through the loop the tick_do_update_jiffies64(now) call is always a no-op, since the caller, tick_nohz_restart_sched_tick(), will have already called tick_do_update_jiffies64(now) with that "now" value. Note that tick_nohz_stop_sched_tick() already uses the correct approach: when we notice we cross a jiffy boundary, grab a new timestamp with ktime_get(), and *then* update jiffies. 
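To make the staleness concrete, a hypothetical timeline with HZ=100 (10 ms jiffies): if "now" was sampled at t = 100.009 s and the loop then advanced the timer across the tick boundary at t = 100.010 s, updating jiffies with the old "now" accounts the old jiffy again. The fixed ordering re-samples first (condensed from the diff below):

	now = ktime_get();		/* fresh timestamp, e.g. t = 100.0101 s */
	tick_do_update_jiffies64(now);	/* accounts the jiffy just crossed */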
Signed-off-by: Neal Cardwell Cc: Ben Segall Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1332875377-23014-1-git-send-email-ncardwell@google.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3526038f2836..6a3a5b9ff561 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_get_expires(&ts->sched_timer), 0)) break; } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); + /* Reread time and update jiffies */ now = ktime_get(); + tick_do_update_jiffies64(now); } } -- cgit v1.2.2 From 9886f444129171569461d8c39983e16f4871e3b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Apr 2012 10:50:55 +0200 Subject: itimer: Use printk_once instead of WARN_ONCE David pointed out that using WARN_ONCE() to report usage of a deprecated misfeature makes folks unhappy. Use printk_once() instead. Andrew told me to stop grumbling and to remove the silly typecast while touching the file. Reported-by: David Rientjes Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/itimer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index c70369a74b5a..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -285,9 +285,10 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) return -EFAULT; } else { - memset((char *) &set_buffer, 0, sizeof(set_buffer)); - WARN_ONCE(1, "setitimer: new_value pointer is NULL." - " Misfeature support will be removed\n"); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); -- cgit v1.2.2 From fa4da365bc7772c2cd6d5405bdf151612455f957 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 9 Apr 2012 15:41:44 -0700 Subject: clockevents: Track broadcast device mode change in tick_broadcast_switch_to_oneshot() In the commit 77b0d60c5adf39c74039e2142a1d3cd1e4d53799, "clockevents: Leave the broadcast device in shutdown mode when not needed", we were bailing out too quickly in tick_broadcast_switch_to_oneshot(), without tracking the broadcast device mode change to 'TICKDEV_MODE_ONESHOT'. This breaks the platforms which need broadcast device oneshot services during deep idle states. tick_broadcast_oneshot_control() thinks that it is in periodic mode and fails to take proper decisions based on the CLOCK_EVT_NOTIFY_BROADCAST_[ENTER, EXIT] notifications during deep idle entry/exit. Fix this by tracking the broadcast device mode as 'TICKDEV_MODE_ONESHOT' before leaving the broadcast HW device in shutdown mode if there are no active requests for the moment. 
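The consumer that trips over the stale mode is tick_broadcast_oneshot_control(); a condensed sketch of its early check (from the 3.4-era source) shows why the ENTER/EXIT notifications turn into no-ops when the mode is left at periodic:

	void tick_broadcast_oneshot_control(unsigned long reason)
	{
		/*
		 * Periodic mode does not care about the enter/exit
		 * of power states, so bail out early ...
		 */
		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
			return;
		/* ... the oneshot broadcast handling follows here */
	}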
Reported-and-tested-by: Santosh Shilimkar Signed-off-by: Suresh Siddha Cc: johnstul@us.ibm.com Link: http://lkml.kernel.org/r/1334011304.12400.81.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e883f57a3cd3..bf57abdc7bd0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,10 +575,12 @@ void tick_broadcast_switch_to_oneshot(void) unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + if (cpumask_empty(tick_get_broadcast_mask())) goto end; - tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); -- cgit v1.2.2 From 5b7526e3a640e491075557acaa842c59c652c0c3 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 5 Apr 2012 16:52:13 -0700 Subject: irq/irq_domain: Quit ignoring error returns from irq_alloc_desc_from(). In commit 4bbdd45a (irq_domain/powerpc: eliminate irq_map; use irq_alloc_desc() instead) code was added that ignores error returns from irq_alloc_desc_from() by (silently) casting the return value to unsigned. The negative error return value now suddenly looks like a valid irq number. Commits cc79ca69 (irq_domain: Move irq_domain code from powerpc to kernel/irq) and 1bc04f2c (irq_domain: Add support for base irq and hwirq in legacy mappings) moved this code to its current location in irqdomain.c. The result of all of this is a null pointer dereference OOPS if one of the error cases is hit. The fix: Don't cast away the negativeness of the return value, and then check for errors. Signed-off-by: David Daney Acked-by: Rob Herring [grant.likely: dropped addition of new 'irq' variable] Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3601f3fbf67c..9310a8d365b0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -350,7 +350,8 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int virq, hint; + unsigned int hint; + int virq; pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -381,9 +382,9 @@ unsigned int irq_create_mapping(struct irq_domain *domain, if (hint == 0) hint++; virq = irq_alloc_desc_from(hint, 0); - if (!virq) + if (virq <= 0) virq = irq_alloc_desc_from(1, 0); - if (!virq) { + if (virq <= 0) { pr_debug("irq: -> virq allocation failed\n"); return 0; } -- cgit v1.2.2 From ac5830a33f5b25eae1dc0708b3e7a3d270a6c07f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 10 Apr 2012 15:25:42 +0300 Subject: irq_domain: correct the debugfs file name The actual name of the irq_domain mapping debugfs file is "irq_domain_mapping", not "virq_mapping". Signed-off-by: Mika Westerberg Signed-off-by: Grant Likely --- kernel/irq/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index cf1a4a68ce44..d1a758bc972a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -62,7 +62,7 @@ config IRQ_DOMAIN_DEBUG help This option will show the mapping relationship between hardware irq numbers and Linux irq numbers. 
The mapping is exposed via debugfs - in the file "virq_mapping". + in the file "irq_domain_mapping". If you don't know what this means you don't need it. -- cgit v1.2.2 From 15e06bf64f686befd2030da867a3dad965b96cc0 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 11 Apr 2012 00:26:25 -0600 Subject: irqdomain: Fix debugfs formatting This patch fixes the irq_domain_mapping debugfs output to pad pointer values with leading zeros so that pointer values are displayed correctly. Otherwise you get output similar to "0x 5e0000000000000". Also, when the irq_domain is set to 'null' Signed-off-by: Grant Likely Cc: David Daney Cc: Mika Westerberg --- kernel/irq/irqdomain.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9310a8d365b0..eb05e40f4553 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -643,8 +643,8 @@ static int virq_debug_show(struct seq_file *m, void *private) void *data; int i; - seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "domain name"); + seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + "chip name", 2 * sizeof(void *) + 2, "chip data", "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -667,7 +667,7 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "%-15s ", p); data = irq_desc_get_chip_data(desc); - seq_printf(m, "0x%16p ", data); + seq_printf(m, data ? "0x%p " : " %p ", data); if (desc->irq_data.domain && desc->irq_data.domain->of_node) p = desc->irq_data.domain->of_node->full_name; -- cgit v1.2.2 From 79549c6dfda0603dba9a70a53467ce62d9335c33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 9 Apr 2012 21:03:50 +0200 Subject: cred: copy_process() should clear child->replacement_session_keyring keyctl_session_to_parent(task) sets ->replacement_session_keyring; it should be processed and cleared by key_replace_session_keyring(). However, this task can fork before it notices TIF_NOTIFY_RESUME and the new child gets the bogus ->replacement_session_keyring copied by dup_task_struct(). This is obviously wrong and, if nothing else, it leads to put_cred(already_freed_cred). Change copy_creds() to clear this member. If copy_process() fails before this point the wrong ->replacement_session_keyring doesn't matter; exit_creds() won't be called. Cc: Signed-off-by: Oleg Nesterov Acked-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 97b36eeca4c9..e70683d9ec32 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -386,6 +386,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; + p->replacement_session_keyring = NULL; + if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && -- cgit v1.2.2 From 6fa6c8e25e95bdc73e92e4c96b8e3299169b616e Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 15 Feb 2012 15:06:08 -0700 Subject: irq_domain: Move irq_virq_count into NOMAP revmap This patch replaces the old global setting of irq_virq_count that is only used by the NOMAP mapping and instead uses a revmap_data property so that the maximum NOMAP allocation can be set per NOMAP irq_domain. There is exactly one user of irq_virq_count in-tree right now: PS3. Also, irq_virq_count is only useful for the NOMAP mapping. 
So, instead of having a single global irq_virq_count value, this change drops it entirely and adds a max_irq argument to irq_domain_add_nomap(). That makes it a property of an individual nomap irq domain instead of a global system setting. Signed-off-by: Grant Likely Tested-by: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller --- kernel/irq/irqdomain.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb05e40f4553..d34413e78628 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,7 +23,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); -static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; /** @@ -184,13 +183,16 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, } struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + unsigned int max_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_NOMAP, ops, host_data); - if (domain) + if (domain) { + domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); + } return domain; } @@ -262,22 +264,6 @@ void irq_set_default_host(struct irq_domain *domain) irq_default_domain = domain; } -/** - * irq_set_virq_count() - Set the maximum number of linux irqs - * @count: number of linux irqs, capped with NR_IRQS - * - * This is mainly for use by platforms like iSeries who want to program - * the virtual irq number in the controller to avoid the reverse mapping - */ -void irq_set_virq_count(unsigned int count) -{ - pr_debug("irq: Trying to set virq count to %d\n", count); - - BUG_ON(count < NUM_ISA_INTERRUPTS); - if (count < NR_IRQS) - irq_virq_count = count; -} - static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { @@ -320,13 +306,12 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) pr_debug("irq: create_direct virq allocation failed\n"); return 0; } - if (virq >= irq_virq_count) { + if (virq >= domain->revmap_data.nomap.max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - irq_virq_count); + domain->revmap_data.nomap.max_irq); irq_free_desc(virq); return 0; } - pr_debug("irq: create_direct obtained virq %d\n", virq); if (irq_setup_virq(domain, virq, virq)) { @@ -378,7 +363,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return irq_domain_legacy_revmap(domain, hwirq); /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; + hint = hwirq % nr_irqs; if (hint == 0) hint++; virq = irq_alloc_desc_from(hint, 0); @@ -516,7 +501,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int i; - unsigned int hint = hwirq % irq_virq_count; + unsigned int hint = hwirq % nr_irqs; /* Look for default domain if nececssary */ if (domain == NULL) @@ -537,7 +522,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (data && (data->domain == domain) && (data->hwirq == hwirq)) return i; i++; - if (i >= irq_virq_count) + if (i >= nr_irqs) i = 1; } while(i != hint); return 0; -- cgit v1.2.2 From 026ee1f66aaa7f01b617a0ba89ac4b531f9603f1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 12 Apr 2012 12:49:17 -0700 Subject: panic: fix stack dump print on direct call to panic() Commit 6e6f0a1f0fa6 ("panic: don't print 
redundant backtraces on oops") causes a regression where no stack trace will be printed at all for the case where kernel code calls panic() directly while not processing an oops, and of course there are 100's of instances of this type of call. The original commit executed the check (!oops_in_progress), but this will always be false because just before the dump_stack() there is a call to bust_spinlocks(1), which does the following: void __attribute__((weak)) bust_spinlocks(int yes) { if (yes) { ++oops_in_progress; The proper way to resolve the problem that original commit tried to solve is to avoid printing a stack dump from panic() when the either of the following conditions is true: 1) TAINT_DIE has been set (this is done by oops_end()) This indicates and oops has already been printed. 2) oops_in_progress > 1 This guards against the rare case where panic() is invoked a second time, or in between oops_begin() and oops_end() Signed-off-by: Jason Wessel Cc: Andi Kleen Cc: [3.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 80aed44e345a..8ed89a175d79 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -97,7 +97,7 @@ void panic(const char *fmt, ...) /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!oops_in_progress) + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) dump_stack(); #endif -- cgit v1.2.2 From 5269a9ab7def9a3116663347d59c4d70afa2d180 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 12 Apr 2012 14:42:15 -0600 Subject: irq_domain: fix type mismatch in debugfs output format sizeof(void*) returns an unsigned long, but it was being used as a width parameter to a "%-*s" format string which requires an int. On 64 bit platforms this causes a type mismatch: linux/kernel/irq/irqdomain.c:575: warning: field width should have type 'int', but argument 6 has type 'long unsigned int' This change casts the size to an int so printf gets the right data type. Reported-by: Andreas Schwab Signed-off-by: Grant Likely Cc: David Daney --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d34413e78628..0e0ba5f840b2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -629,7 +629,8 @@ static int virq_debug_show(struct seq_file *m, void *private) int i; seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", - "chip name", 2 * sizeof(void *) + 2, "chip data", "domain name"); + "chip name", (int)(2 * sizeof(void *) + 2), "chip data", + "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); -- cgit v1.2.2 From ef1f0982540e5f79c8bbf3675bbc0a9734dba3fc Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 11 Apr 2012 12:21:39 -0400 Subject: irq_work: fix compile failure on tile from missing include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with IRQ_WORK configured results in kernel/irq_work.c: In function ‘irq_work_run’: kernel/irq_work.c:110: error: implicit declaration of function ‘irqs_disabled’ The appropriate header just needs to be included. 
Signed-off-by: Chris Metcalf Signed-off-by: Paul Gortmaker --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 0c56d44b9fd5..1588e3b2871b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -11,6 +11,7 @@ #include #include #include +#include #include /* -- cgit v1.2.2 From 6e48b550d1f5f1919e6500547ae14a73fbf66c7b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 13 Apr 2012 09:52:59 +0100 Subject: tracing: Fix build breakage without CONFIG_PERF_EVENTS (again) Today's -next fails to link for me: kernel/built-in.o:(.data+0x178e50): undefined reference to `perf_ftrace_event_register' It looks like multiple fixes have been merged for the issue fixed by commit fa73dc9 (tracing: Fix build breakage without CONFIG_PERF_EVENTS), though I can't identify the other changes that have gone in at the minute; it's possible that the changes which caused the breakage fixed by the previous commit got dropped but the fix made it in. Link: http://lkml.kernel.org/r/1334307179-21255-1-git-send-email-broonie@opensource.wolfsonmicro.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Mark Brown Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95059f091a24..f95d65da6db8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -836,11 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[]; filter) #include "trace_entries.h" -#ifdef CONFIG_FUNCTION_TRACER +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) int perf_ftrace_event_register(struct ftrace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL -#endif /* CONFIG_FUNCTION_TRACER */ +#endif #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.2 From 348f0fc238efb441a28e7644c51f9fd3001b228a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Apr 2012 15:41:28 -0400 Subject: tracing: Fix regression with tracing_on The change to make tracing_on affect only the ftrace ring buffer caused a bug where it won't affect any ring buffer. The problem was that the buffer of the trace_array was passed to the write function and not the trace array itself. The trace_array can change the buffer when running a latency tracer. If this happens, then the buffer being disabled may not be the buffer currently used by ftrace. This will cause the tracing_on file to become useless. The simple fix is to pass the trace_array to the write function instead of the buffer. Then the actual buffer may be changed. 
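Condensed, the fix changes what the debugfs file captures: hold the trace_array and resolve its buffer at each access, instead of latching the buffer pointer at file-creation time:

	/* before: latched at creation, stale once a latency
	 * tracer swaps buffers */
	struct ring_buffer *buffer = filp->private_data;

	/* after: resolved on every read/write */
	struct trace_array *tr = filp->private_data;
	struct ring_buffer *buffer = tr->buffer;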
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed7b5d1e12f4..2a22255c1010 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4629,7 +4629,8 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ring_buffer *buffer = filp->private_data; + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->buffer; char buf[64]; int r; @@ -4647,7 +4648,8 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ring_buffer *buffer = filp->private_data; + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->buffer; unsigned long val; int ret; @@ -4734,7 +4736,7 @@ static __init int tracer_init_debugfs(void) &trace_clock_fops); trace_create_file("tracing_on", 0644, d_tracer, - global_trace.buffer, &rb_simple_fops); + &global_trace, &rb_simple_fops); #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, -- cgit v1.2.2 From 92c38702e98e58438c3760ebb279c40bbca8bd5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Apr 2012 12:12:09 -0700 Subject: rcu: Permit call_rcu() from CPU_DYING notifiers As of: 29494be71afe ("rcu,cleanup: simplify the code when cpu is dying") RCU adopts callbacks from the dying CPU in its CPU_DYING notifier, which means that any callbacks posted by later CPU_DYING notifiers are ignored until the CPU comes back online. A WARN_ON_ONCE() was added to __call_rcu() by: e56014000816 ("rcu: Simplify offline processing") to check for this condition. Although this condition did not trigger (at least as far as I know) during -next testing, it did recently trigger in mainline: https://lkml.org/lkml/2012/4/2/34 What is needed longer term is for RCU's CPU_DEAD notifier to adopt any callbacks that were posted by CPU_DYING notifiers; however, the Linux kernel has been running with this sort of thing happening for quite some time. So the only thing that qualifies as a regression is the WARN_ON_ONCE(), which this commit removes. Making RCU's CPU_DEAD notifier adopt callbacks posted by CPU_DYING notifiers is a topic for the 3.5 release of the Linux kernel. Reported-by: Sergey Senozhatsky Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922c..d0c5baf1ab18 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1820,7 +1820,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ -- cgit v1.2.2 From b435092f70ec5ebbfb6d075d5bf3c631b49a51de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Apr 2012 12:08:23 +0200 Subject: tick: Fix oneshot broadcast setup really Sven Joachim reported that suspend/resume on rc3 trips over a NULL pointer dereference. Linus spotted the clockevent handler being NULL. commit fa4da365b (clockevents: Track broadcast device mode change in tick_broadcast_switch_to_oneshot()) tried to fix a problem with the broadcast device setup, which was introduced in commit 77b0d60c5 (clockevents: Leave the broadcast device in shutdown mode when not needed). 
The initial commit avoided setting up the broadcast device when no broadcast request bits were set, but that left the broadcast device dysfunctional. In consequence, deep idle states which need the broadcast device were not woken up. commit fa4da365b tried to fix that by initializing the state of the broadcast facility, but that missed the fact that nothing initializes the event handler and some other state of the underlying clock event device. The fix is to revert both commits and make only the mode setting of the clock event device conditional on the state of active broadcast users. That initializes everything except the low level device mode, but this happens when the broadcast functionality is invoked by deep idle. Reported-and-tested-by: Sven Joachim Signed-off-by: Thomas Gleixner Cc: Rafael J. Wysocki Cc: Linus Torvalds Cc: Suresh Siddha Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1204181205540.2542@ionos --- kernel/time/tick-broadcast.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index bf57abdc7bd0..119aca5c6845 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -531,7 +531,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); /* Take the do_timer update */ tick_do_timer_cpu = cpu; @@ -549,6 +548,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) to_cpumask(tmpmask)); if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(to_cpumask(tmpmask), tick_next_period); tick_broadcast_set_event(tick_next_period, 1); @@ -577,15 +577,10 @@ void tick_broadcast_switch_to_oneshot(void) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; - - if (cpumask_empty(tick_get_broadcast_mask())) - goto end; - bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); -end: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.2 From b9a6a23566960d0dd3f51e2e68b472cd61911078 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Apr 2012 17:31:58 +0200 Subject: tick: Ensure that the broadcast device is initialized Santosh found another trap when we avoid initializing the broadcast device in the switch_to_oneshot code. The broadcast device might still be in SHUTDOWN state when we actually need to use it. That obviously breaks, as set_next_event() is called on a shutdown device. This did not break on x86, but Suresh analyzed it: From the review, most likely on Sven's system we are force enabling the hpet using the pci quirk's method very late. And in this case, hpet_clockevent (which will be global_clock_event) handler can be null, specifically as this platform might not be using deeper c-states and using the reliable APIC timer. Prior to commit 'fa4da365bc7772c', that handler will be set to 'tick_handle_oneshot_broadcast' when we switch the broadcast timer to oneshot mode, even though we don't use it. Post commit 'fa4da365bc7772c', we stopped switching the broadcast mode to oneshot as this is not really needed and his platform's global_clock_event's handler will remain null. While on my SNB laptop, same is set to 'clockevents_handle_noop' because hpet gets enabled very early. 
(noop handler on my platform set when the early enabled hpet timer gets replaced by the lapic timer). But the commit 'fa4da365bc7772c' tracked the broadcast timer mode in the SW as oneshot, even though it didn't touch the HW timer. During resume however, tick_resume_broadcast() saw the SW broadcast mode as oneshot and actually programmed the broadcast device also into oneshot mode. So this triggered the null pointer dereference after the hpet wraps around, depending on what the hpet counter is set to. On the normal platforms where hpet gets enabled early, we should be seeing a spurious interrupt (on my SNB laptop I see one spurious interrupt after around 5 minutes ;) which is the 32-bit hpet counter wraparound time), but that's a separate issue. Enforce the mode setting when trying to set an event. Reported-and-tested-by: Santosh Shilimkar Signed-off-by: Thomas Gleixner Acked-by: Suresh Siddha Cc: torvalds@linux-foundation.org Cc: svenjoac@gmx.de Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1204181723350.2542@ionos --- kernel/time/tick-broadcast.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 119aca5c6845..029531f3818c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -373,6 +373,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + return clockevents_program_event(bc, expires, force); } -- cgit v1.2.2 From a6371f80230eaaafd7eef7efeedaa9509bdc982d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 18 Apr 2012 19:27:39 -0700 Subject: tick: Fix the spurious broadcast timer ticks after resume During resume, tick_resume_broadcast() programs the broadcast timer in oneshot mode unconditionally. On platforms where the broadcast timer is not really required, this will generate spurious broadcast timer ticks upon resume. For example, on the always running apic timer platforms with HPET, I see a spurious hpet tick once every ~5 minutes (which is the 32-bit hpet counter wraparound time). Similar to boot time, during resume make the oneshot mode setting of the broadcast clock event device conditional on the state of active broadcast users. 
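The ~5 minute figure follows directly from the counter width: assuming the typical 14.31818 MHz HPET rate, the 32-bit main counter wraps after 2^32 / 14318180 Hz ≈ 300 seconds, i.e. about 5 minutes.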
Signed-off-by: Suresh Siddha Tested-by: Santosh Shilimkar Tested-by: svenjoac@gmx.de Cc: torvalds@linux-foundation.org Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/1334802459.28674.209.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 029531f3818c..f113755695e2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -346,7 +346,8 @@ int tick_resume_broadcast(void) tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: - broadcast = tick_resume_broadcast_oneshot(bc); + if (!cpumask_empty(tick_get_broadcast_mask())) + broadcast = tick_resume_broadcast_oneshot(bc); break; } } -- cgit v1.2.2 From db4c75cbebd7e5910cd3bcb6790272fcc3042857 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Apr 2012 10:31:47 -0400 Subject: tracing: Fix stacktrace of latency tracers (irqsoff and friends) While debugging a latency issue with someone on IRC (mirage335) on #linux-rt (OFTC), we discovered that the stacktrace output of the latency tracers (preemptirqsoff) was empty. This bug was caused by the creation of the dynamic length stack trace again (like commit 12b5da3 "tracing: Fix ent_size in trace output" was). This bug is caused by the latency tracers requiring the next event to determine the time between the current event and the next. But by grabbing the next event, iter->ent_size is set to the size of the next event instead of the current one. As the stacktrace event is the last event, this makes the ent_size zero and causes nothing to be printed for the stack trace. The dynamic stacktrace uses the ent_size to determine how much of the stack can be printed. An ent_size of zero means no stack. The simple fix is to save the iter->ent_size before finding the next event. Note, mirage335 asked to remain anonymous from LKML and git, so I will not add the Reported-by and Tested-by tags, even though he did report the issue and tested the fix. Cc: stable@vger.kernel.org # 3.1+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 859fae6b1825..df611a0e76c5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; int ret; + /* trace_find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, @@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter) unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); unsigned long rel_usecs; + /* Restore the original ent_size */ + iter->ent_size = ent_size; + if (!next_entry) next_ts = iter->ts; rel_usecs = ns2usecs(next_ts - iter->ts); -- cgit v1.2.2 From 9f3045eca89a2e6fdd1901aafb9e28231d3f31fb Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 18 Apr 2012 16:29:57 -0400 Subject: irq: hide debug macros so they don't collide with others. The file kernel/irq/debug.h temporarily defines P, PS, PD and then undefines them. 
However, these names aren't really "internal" enough, and collide with other more legit users such as the ones in the xtensa arch, causing: In file included from kernel/irq/internals.h:58:0, from kernel/irq/irqdesc.c:18: kernel/irq/debug.h:8:0: warning: "PS" redefined [enabled by default] arch/xtensa/include/asm/regs.h:59:0: note: this is the location of the previous definition Add a handful of underscores to do a better job of hiding these temporary macros. Cc: Thomas Gleixner Signed-off-by: Paul Gortmaker --- kernel/irq/debug.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 97a8bfadc88a..e75e29e4434a 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,10 +4,10 @@ #include -#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) -#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) +#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ -#define PD(f) do { } while (0) +#define ___PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_LEVEL); - P(IRQ_PER_CPU); - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOTHREAD); - P(IRQ_NOAUTOEN); + ___P(IRQ_LEVEL); + ___P(IRQ_PER_CPU); + ___P(IRQ_NOPROBE); + ___P(IRQ_NOREQUEST); + ___P(IRQ_NOTHREAD); + ___P(IRQ_NOAUTOEN); - PS(IRQS_AUTODETECT); - PS(IRQS_REPLAY); - PS(IRQS_WAITING); - PS(IRQS_PENDING); + ___PS(IRQS_AUTODETECT); + ___PS(IRQS_REPLAY); + ___PS(IRQS_WAITING); + ___PS(IRQS_PENDING); - PD(IRQS_INPROGRESS); - PD(IRQS_DISABLED); - PD(IRQS_MASKED); + ___PD(IRQS_INPROGRESS); + ___PD(IRQS_DISABLED); + ___PD(IRQS_MASKED); } -#undef P -#undef PS -#undef PD +#undef ___P +#undef ___PS +#undef ___PD -- cgit v1.2.2 From f8262d476823a7ea1eb497ff9676d1eab2393c75 Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Tue, 24 Apr 2012 23:53:28 +0200 Subject: PM / Hibernate: fix the number of pages used for hibernate/thaw buffering Hibernation regression fix, since 3.2. Calculate the number of required free pages based on non-high memory pages only, because that is where the buffers will come from. Commit 081a9d043c983f161b78fdc4671324d1342b86bc introduced a new buffer page allocation logic during hibernation, in order to improve performance. The number of pages allocated was calculated based on the total number of pages available, although only non-high memory pages are usable for this purpose. This caused hibernation code to attempt to over-allocate pages on platforms that have high memory, which led to hangs. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8742fd013a94..eef311a58a64 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -51,6 +51,23 @@ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) +/* + * Number of free pages that are not high. + */ +static inline unsigned long low_free_pages(void) +{ + return nr_free_pages() - nr_free_highpages(); +} + +/* + * Number of pages required to be kept free while writing the image. 
Always + * half of all available low pages before the writing starts. + */ +static inline unsigned long reqd_free_pages(void) +{ + return low_free_pages() / 2; +} + struct swap_map_page { sector_t entries[MAP_PAGE_ENTRIES]; sector_t next_swap; @@ -72,7 +89,7 @@ struct swap_map_handle { sector_t cur_swap; sector_t first_sector; unsigned int k; - unsigned long nr_free_pages, written; + unsigned long reqd_free_pages; u32 crc32; }; @@ -316,8 +333,7 @@ static int get_swap_writer(struct swap_map_handle *handle) goto err_rel; } handle->k = 0; - handle->nr_free_pages = nr_free_pages() >> 1; - handle->written = 0; + handle->reqd_free_pages = reqd_free_pages(); handle->first_sector = handle->cur_swap; return 0; err_rel: @@ -352,11 +368,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, handle->cur_swap = offset; handle->k = 0; } - if (bio_chain && ++handle->written > handle->nr_free_pages) { + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; - handle->written = 0; + handle->reqd_free_pages = reqd_free_pages(); } out: return error; @@ -618,7 +634,7 @@ static int save_image_lzo(struct swap_map_handle *handle, * Adjust number of free pages after all allocations have been done. * We don't want to run out of pages when writing. */ - handle->nr_free_pages = nr_free_pages() >> 1; + handle->reqd_free_pages = reqd_free_pages(); /* * Start the CRC32 thread. -- cgit v1.2.2 From eb95308ee2a69403909e111837b9068c64cfc349 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Apr 2012 13:38:40 +0200 Subject: sched: Fix more load-balancing fallout Commits 367456c756a6 ("sched: Ditch per cgroup task lists for load-balancing") and 5d6523ebd ("sched: Fix load-balance wreckage") left some more wreckage. By setting loop_max unconditionally to ->nr_running, load-balancing could take a lot of time on very long runqueues (hackbench!). So keep the sysctl as a max limit on the number of tasks we'll iterate. Furthermore, the min load filter for migration completely fails with cgroups since inequality in per-cpu state can easily lead to such small loads :/ Furthermore, the change to add new tasks to the tail of the queue instead of the head seems to have some effect, though I'm not quite sure I understand why. Combined, these fixes solve the huge hackbench regression reported by Tim when hackbench is run in a cgroup. 
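The loop_max part of the fix, condensed from the diff below: the sysctl goes back to being a hard cap on how many tasks a single balance pass will walk (the breather interval becomes a fixed constant instead):

	/* before: could walk the whole runqueue under the lock */
	env.loop_max = busiest->nr_running;

	/* after: never iterate more than sysctl_sched_nr_migrate tasks */
	env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate,
			     busiest->nr_running);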
Reported-by: Tim Chen Acked-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1335365763.28150.267.camel@twins [ got rid of the CONFIG_PREEMPT tuning and made small readability edits ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++-------- kernel/sched/features.h | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebdc58f0..e9553640c1c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3215,6 +3215,8 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); +static const unsigned int sched_nr_migrate_break = 32; + /* * move_tasks tries to move up to load_move weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". @@ -3242,7 +3244,7 @@ static int move_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; + env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } @@ -3252,7 +3254,7 @@ static int move_tasks(struct lb_env *env) load = task_h_load(p); - if (load < 16 && !env->sd->nr_balance_failed) + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; if ((load / 2) > env->load_move) @@ -4407,7 +4409,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, - .loop_break = sysctl_sched_nr_migrate, + .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); @@ -4445,10 +4447,10 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = busiest->nr_running; + env.load_move = imbalance; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); more_balance: local_irq_save(flags); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d0..de00a486c5c6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) -- cgit v1.2.2 From fb2cf2c660971bea0ad86a9a5c19ad39eab61344 Mon Sep 17 00:00:00 2001 From: "he, bo" Date: Wed, 25 Apr 2012 19:59:21 +0800 Subject: sched: Fix OOPS when build_sched_domains() percpu allocation fails Under extreme memory pressure, percpu allocation might fail. We hit it when the system goes to suspend-to-ram, causing a kworker panic: EIP: [] build_sched_domains+0x23a/0xad0 Kernel panic - not syncing: Fatal exception Pid: 3026, comm: kworker/u:3 3.0.8-137473-gf42fbef #1 Call Trace: [] panic+0x66/0x16c [...] [] partition_sched_domains+0x287/0x4b0 [] cpuset_update_active_cpus+0x1fe/0x210 [] cpuset_cpu_inactive+0x1d/0x30 [...] With this fix applied, build_sched_domains() will return -ENOMEM and the suspend attempt fails. 
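The shape of the fix, condensed from the diff below: __sdt_free() must treat each per-cpu array as possibly unallocated, since the corresponding allocation can fail part way through:

	if (sdd->sd) {			/* may be NULL after a failed alloc */
		sd = *per_cpu_ptr(sdd->sd, j);
		...
	}
	if (sdd->sg)
		kfree(*per_cpu_ptr(sdd->sg, j));
	...
	free_percpu(sdd->sd);
	sdd->sd = NULL;			/* avoid a double free on a later pass */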
Signed-off-by: he, bo Reviewed-by: Zhang, Yanmin Reviewed-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Link: http://lkml.kernel.org/r/1335355161.5892.17.camel@hebo [ So, we fail to deallocate a CPU because we cannot allocate RAM :-/ I don't like that kind of sad behavior but nevertheless it should not crash under high memory load. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4603b9d8f30a..0533a688ce22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6405,16 +6405,26 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); + sdd->sd = NULL; free_percpu(sdd->sg); + sdd->sg = NULL; free_percpu(sdd->sgp); + sdd->sgp = NULL; } } -- cgit v1.2.2 From 724b6daa13e100067c30cfc4d1ad06629609dc4e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Apr 2012 11:54:13 +1000 Subject: perf: Fix perf_event_for_each() to use sibling In perf_event_for_each() we call a function on an event, and then iterate over the siblings of the event. However, we don't call the function on the siblings; we call it repeatedly on the original event - it seems "obvious" that we should be calling it with sibling as the argument. It looks like this broke in commit 75f937f24bd9 ("Fix ctx->mutex vs counter->mutex inversion"). The only effect of the bug is that the PERF_IOC_FLAG_GROUP parameter to the ioctls doesn't work. Signed-off-by: Michael Ellerman Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334109253-31329-1-git-send-email-michael@ellerman.id.au Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a6a9ec4cd8f5..fd126f82b57c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3183,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(event, func); func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) - perf_event_for_each_child(event, func); + perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); } -- cgit v1.2.2
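To illustrate the user-visible effect of the last fix above: with the bug, a group-wide ioctl only ever acted on the leader. A minimal (hypothetical) call, assuming group_fd was returned by perf_event_open() for a group leader with attached siblings:

	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	/* Enable the leader and, with the fix, every sibling too */
	ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);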