From 95e904c7da715aa2dbfb595da66b63de37a0bb04 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Sun, 11 May 2008 05:55:33 +0530
Subject: sched: fix defined-but-unused warning

Fix this warning, which appears with !CONFIG_SMP:
kernel/sched.c:1216: warning: `init_hrtick' defined but not used

Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')
diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751e7612..04228524d160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1127,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+#ifdef CONFIG_SMP
 static void hotplug_hrtick_disable(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1182,6 +1183,7 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
+#endif /* CONFIG_SMP */
 
 static void init_rq_hrtick(struct rq *rq)
 {
-- 
cgit v1.2.2


From 49307fd6f72bdd68cc2bd23e7da0bcfecf8087c9 Mon Sep 17 00:00:00 2001
From: Dario Faggioli <raistlin@linux.it>
Date: Wed, 18 Jun 2008 09:18:38 +0200
Subject: sched: NULL pointer dereference while setting sched_rt_period_us
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_RT_GROUP_SCHED and CONFIG_CGROUP_SCHED are enabled, with:

 echo 10000 > /proc/sys/kernel/sched_rt_period_us

We get this:

 BUG: unable to handle kernel NULL pointer dereference at 0000008c
 [  947.682233] IP: [<c0216b72>] __rt_schedulable+0x12/0x160
 [  947.683123] *pde = 00000000=20
 [  947.683782] Oops: 0000 [#1]
 [  947.684307] Modules linked in:
 [  947.684308]
 [  947.684308] Pid: 2359, comm: bash Not tainted (2.6.26-rc6 #8)
 [  947.684308] EIP: 0060:[<c0216b72>] EFLAGS: 00000246 CPU: 0
 [  947.684308] EIP is at __rt_schedulable+0x12/0x160
 [  947.684308] EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000001
 [  947.684308] ESI: c0521db4 EDI: 00000001 EBP: c6cc9f00 ESP: c6cc9ed0
 [  947.684308]  DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
 [  947.684308] Process bash (pid: 2359, tiÆcc8000 taskÇa54f00=20 task.tiÆcc8000)
 [  947.684308] Stack: c0222790 00000000 080f8c08 c0521db4 c6cc9f00 00000001 00000000 00000000
 [  947.684308]        c6cc9f9c 00000000 c0521db4 00000001 c6cc9f28 c0216d40 00000000 00000000
 [  947.684308]        c6cc9f9c 000f4240 000e7ef0 ffffffff c0521db4 c79dfb60 c6cc9f58 c02af2cc
 [  947.684308] Call Trace:
 [  947.684308]  [<c0222790>] ? do_proc_dointvec_conv+0x0/0x50
 [  947.684308]  [<c0216d40>] ? sched_rt_handler+0x80/0x110
 [  947.684308]  [<c02af2cc>] ? proc_sys_call_handler+0x9c/0xb0
 [  947.684308]  [<c02af2fa>] ? proc_sys_write+0x1a/0x20
 [  947.684308]  [<c0273c36>] ? vfs_write+0x96/0x160
 [  947.684308]  [<c02af2e0>] ? proc_sys_write+0x0/0x20
 [  947.684308]  [<c027423d>] ? sys_write+0x3d/0x70
 [  947.684308]  [<c0202ef5>] ? sysenter_past_esp+0x6a/0x91
 [  947.684308]  =======================
 [  947.684308] Code: 24 04 e8 62 b1 0e 00 89 c7 89 f8 8b 5d f4 8b 75
 f8 8b 7d fc 89 ec 5d c3 90 55 89 e5 57 56 53 83 ec 24 89 45 ec 89 55 e4
 89 4d e8 <8b> b8 8c 00 00 00 85 ff 0f 84 c9 00 00 00 8b 57 24 39 55 e8
 8b
 [  947.684308] EIP: [<c0216b72>] __rt_schedulable+0x12/0x160 SS:ESP  0068:c6cc9ed0

We think the following patch solves the issue.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <trimarchimichael@yahoo.it>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 04228524d160..320e9a43d4cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8350,7 +8350,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_group *tgi, *parent = tg->parent;
+	struct task_group *tgi, *parent = tg ? tg->parent : NULL;
 	unsigned long total = 0;
 
 	if (!parent) {
-- 
cgit v1.2.2


From 7ea56616ba6b3d67a4892728182e38ae162ea3e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 09:06:56 +0200
Subject: sched: rt-group: fix hierarchy

Don't re-set the entity's runqueue to the wrong rq after we've set it
to the right one.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Daniel K. <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 320e9a43d4cc..ce375cb551e9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7628,7 +7628,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 	else
 		rt_se->rt_rq = parent->my_q;
 
-	rt_se->rt_rq = &rq->rt;
 	rt_se->my_q = rt_rq;
 	rt_se->parent = parent;
 	INIT_LIST_HEAD(&rt_se->run_list);
-- 
cgit v1.2.2


From ad2a3f13b7258a5daaaeb8cff9f835aac468b71d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 09:06:57 +0200
Subject: sched: rt-group: heirarchy aware throttle

The bandwidth throttle code dequeues a group when it runs out of quota, and
re-queues it once the period rolls over and the quota gets refreshed.

Sadly it failed to take the hierarchy into consideration. Share more of the
enqueue/dequeue code with regular task opterations.

Also, some operations like sched_setscheduler() can dequeue/enqueue tasks that
are in throttled runqueues, we should not inadvertly re-enqueue empty runqueues
so check for that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Daniel K. <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 59 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3432d573205d..837241568d76 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -449,13 +449,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 #endif
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	if (group_rq && rt_rq_throttled(group_rq))
+	/*
+	 * Don't enqueue the group if its throttled, or when empty.
+	 * The latter is a consequence of the former when a child group
+	 * get throttled and the current group doesn't have any other
+	 * active members.
+	 */
+	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 		return;
 
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -464,7 +470,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -480,11 +486,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Because the prio of an upper entry depends on the lower
  * entries, we must remove entries top - down.
  */
-static void dequeue_rt_stack(struct task_struct *p)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 {
-	struct sched_rt_entity *rt_se, *back = NULL;
+	struct sched_rt_entity *back = NULL;
 
-	rt_se = &p->rt;
 	for_each_sched_rt_entity(rt_se) {
 		rt_se->back = back;
 		back = rt_se;
@@ -492,7 +497,26 @@ static void dequeue_rt_stack(struct task_struct *p)
 
 	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 		if (on_rt_rq(rt_se))
-			dequeue_rt_entity(rt_se);
+			__dequeue_rt_entity(rt_se);
+	}
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+	dequeue_rt_stack(rt_se);
+	for_each_sched_rt_entity(rt_se)
+		__enqueue_rt_entity(rt_se);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+	dequeue_rt_stack(rt_se);
+
+	for_each_sched_rt_entity(rt_se) {
+		struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+		if (rt_rq && rt_rq->rt_nr_running)
+			__enqueue_rt_entity(rt_se);
 	}
 }
 
@@ -506,32 +530,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	if (wakeup)
 		rt_se->timeout = 0;
 
-	dequeue_rt_stack(p);
-
-	/*
-	 * enqueue everybody, bottom - up.
-	 */
-	for_each_sched_rt_entity(rt_se)
-		enqueue_rt_entity(rt_se);
+	enqueue_rt_entity(rt_se);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
-	struct rt_rq *rt_rq;
 
 	update_curr_rt(rq);
-
-	dequeue_rt_stack(p);
-
-	/*
-	 * re-enqueue all non-empty rt_rq entities.
-	 */
-	for_each_sched_rt_entity(rt_se) {
-		rt_rq = group_rt_rq(rt_se);
-		if (rt_rq && rt_rq->rt_nr_running)
-			enqueue_rt_entity(rt_se);
-	}
+	dequeue_rt_entity(rt_se);
 }
 
 /*
-- 
cgit v1.2.2


From 15a8641eadb492ef7c5489faa25256967bdfd303 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 09:06:59 +0200
Subject: sched: rt-group: fix RR buglet

In tick_task_rt() we first call update_curr_rt() which can dequeue a runqueue
due to it running out of runtime, and then we try to requeue it, of it also
having exhausted its RR quota. Obviously requeueing something that is no longer
on the runqueue will not have the expected result.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Daniel K. <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 837241568d76..1dad5bbb59b6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -549,8 +549,10 @@ static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
+	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 
-	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	if (on_rt_rq(rt_se))
+		list_move_tail(&rt_se->run_list, queue);
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
-- 
cgit v1.2.2


From f18f982abf183e91f435990d337164c7a43d1e6d Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Thu, 29 May 2008 11:17:01 -0700
Subject: sched: CPU hotplug events must not destroy scheduler domains created
 by the cpusets

First issue is not related to the cpusets. We're simply leaking doms_cur.
It's allocated in arch_init_sched_domains() which is called for every
hotplug event. So we just keep reallocation doms_cur without freeing it.
I introduced free_sched_domains() function that cleans things up.

Second issue is that sched domains created by the cpusets are
completely destroyed by the CPU hotplug events. For all CPU hotplug
events scheduler attaches all CPUs to the NULL domain and then puts
them all into the single domain thereby destroying domains created
by the cpusets (partition_sched_domains).
The solution is simple, when cpusets are enabled scheduler should not
create default domain and instead let cpusets do that. Which is
exactly what the patch does.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: pj@sgi.com
Cc: menage@google.com
Cc: rostedt@goodmis.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpuset.c |  6 ++++++
 kernel/sched.c  | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 039baa4cd90c..bceb89557973 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1890,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void)
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 	scan_for_empty_cpusets(&top_cpuset);
 
+	/*
+	 * Scheduler destroys domains on hotplug events.
+	 * Rebuild them based on the current settings.
+	 */
+	rebuild_sched_domains();
+
 	cgroup_unlock();
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index ce375cb551e9..4a3cb0614158 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7237,6 +7237,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
+/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+	ndoms_cur = 0;
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = &fallback_doms;
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7384,6 +7396,7 @@ int arch_reinit_sched_domains(void)
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	detach_destroy_domains(&cpu_online_map);
+	free_sched_domains();
 	err = arch_init_sched_domains(&cpu_online_map);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
@@ -7469,6 +7482,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
+		free_sched_domains();
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -7487,8 +7501,16 @@ static int update_sched_domains(struct notifier_block *nfb,
 		return NOTIFY_DONE;
 	}
 
+#ifndef CONFIG_CPUSETS
+	/*
+	 * Create default domain partitioning if cpusets are disabled.
+	 * Otherwise we let cpusets rebuild the domains based on the
+	 * current setup.
+	 */
+
 	/* The hotplug lock is already held by cpu_up/cpu_down */
 	arch_init_sched_domains(&cpu_online_map);
+#endif
 
 	return NOTIFY_OK;
 }
-- 
cgit v1.2.2


From 30e0e178193d4221abc9926b07a4c7661c7cc4a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 13 May 2008 10:27:17 +0800
Subject: cpuset: limit the input of cpuset.sched_relax_domain_level

We allow the inputs to be [-1 ... SD_LV_MAX), and return -EINVAL
for inputs outside this range.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpuset.c | 4 ++--
 kernel/sched.c  | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 039baa4cd90c..66103a119bfe 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1037,8 +1037,8 @@ int current_cpuset_is_being_rebound(void)
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
-	if ((int)val < 0)
-		val = -1;
+	if (val < -1 || val >= SD_LV_MAX)
+		return -EINVAL;
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751e7612..bb2c699c9f5e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6877,7 +6877,12 @@ static int default_relax_domain_level = -1;
 
 static int __init setup_relax_domain_level(char *str)
 {
-	default_relax_domain_level = simple_strtoul(str, NULL, 0);
+	unsigned long val;
+
+	val = simple_strtoul(str, NULL, 0);
+	if (val < SD_LV_MAX)
+		default_relax_domain_level = val;
+
 	return 1;
 }
 __setup("relax_domain_level=", setup_relax_domain_level);
-- 
cgit v1.2.2


From afd38009cc3acd36d41f349a669ad5825d695b1f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 14:18:17 -0400
Subject: rcupreempt: remove export of rcu_batches_completed_bh

In rcupreempt, rcu_batches_completed_bh is defined as a static inline in
the header file. This does not need to be exported, and not only that,
this breaks my PPC build.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: paulus@samba.org
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcupreempt.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..5e02b7740702 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -217,8 +217,6 @@ long rcu_batches_completed(void)
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
 void __rcu_read_lock(void)
 {
 	int idx;
-- 
cgit v1.2.2


From 9c106c119ebedf624fbd682fd2a4d52e3c8c1a67 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Tue, 27 May 2008 12:23:29 -0500
Subject: softlockup: fix NMI hangs due to lock race - 2.6.26-rc regression

The touch_nmi_watchdog() routine on x86 ultimately calls
touch_softlockup_watchdog().  The problem is that to touch the
softlockup watchdog, the cpu_clock code has to be called which could
involve multiple cpu locks and can lead to a hard hang if one of the
locks is held by a processor that is not going to return anytime soon
(such as could be the case with kgdb or perhaps even with some other
kind of exception).

This patch causes the public version of the
touch_softlockup_watchdog() to defer the cpu clock access to a later
point.

The test case for this problem is to use the following kernel config
options:

CONFIG_KGDB_TESTS=y
CONFIG_KGDB_TESTS_ON_BOOT=y
CONFIG_KGDB_TESTS_BOOT_STRING="V1F100I100000"

It should be noted that kgdb test suite and these options were not
available until 2.6.26-rc2, so it was necessary to patch the kgdb
test suite during the bisection.

I would consider this patch a regression fix because the problem first
appeared in commit 27ec4407790d075c325e1f4da0a19c56953cce23 when some
logic was added to try to periodically sync the clocks.  It was
possible to work around this particular problem by simply not
performing the sync anytime the system was in a critical context.
This was ok until commit 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd,
which added config option CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and some
multi-cpu locks to sync the clocks.  It became clear that accessing
this code from an nmi was the source of the lockups.  Avoiding the
access to the low level clock code from an code inside the NMI
processing also fixed the problem with the 27ec44... commit.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 01b6522fd92b..c828c2339cc9 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -49,12 +49,17 @@ static unsigned long get_timestamp(int this_cpu)
 	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
-void touch_softlockup_watchdog(void)
+static void __touch_softlockup_watchdog(void)
 {
 	int this_cpu = raw_smp_processor_id();
 
 	__raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
 }
+
+void touch_softlockup_watchdog(void)
+{
+	__raw_get_cpu_var(touch_timestamp) = 0;
+}
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
 void touch_all_softlockup_watchdogs(void)
@@ -80,7 +85,7 @@ void softlockup_tick(void)
 	unsigned long now;
 
 	if (touch_timestamp == 0) {
-		touch_softlockup_watchdog();
+		__touch_softlockup_watchdog();
 		return;
 	}
 
@@ -95,7 +100,7 @@ void softlockup_tick(void)
 
 	/* do not print during early bootup: */
 	if (unlikely(system_state != SYSTEM_RUNNING)) {
-		touch_softlockup_watchdog();
+		__touch_softlockup_watchdog();
 		return;
 	}
 
@@ -214,7 +219,7 @@ static int watchdog(void *__bind_cpu)
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
 	/* initialize timestamp */
-	touch_softlockup_watchdog();
+	__touch_softlockup_watchdog();
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	/*
@@ -223,7 +228,7 @@ static int watchdog(void *__bind_cpu)
 	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		touch_softlockup_watchdog();
+		__touch_softlockup_watchdog();
 		schedule();
 
 		if (kthread_should_stop())
-- 
cgit v1.2.2


From d4abc238c9f4df8b3216f3e883f5d0a07b7ac75a Mon Sep 17 00:00:00 2001
From: Bharath Ravi <bharathravi1@gmail.com>
Date: Mon, 16 Jun 2008 15:11:01 +0530
Subject: sched, delay accounting: fix incorrect delay time when constantly
 waiting on runqueue

This patch corrects the incorrect value of per process run-queue wait
time reported by delay statistics. The anomaly was due to the following
reason. When a process leaves the CPU and immediately starts waiting for
CPU on the runqueue (which means it remains in the TASK_RUNNABLE state),
the time of re-entry into the run-queue is never recorded. Due to this,
the waiting time on the runqueue from this point of re-entry upto the
next time it hits the CPU is not accounted for. This is solved by
recording the time of re-entry of a process leaving the CPU in the
sched_info_depart() function IF the process will go back to waiting on
the run-queue. This IF condition is verified by checking whether the
process is still in the TASK_RUNNABLE state.

The patch was tested on 2.6.26-rc6 using two simple CPU hog programs.
The values noted prior to the fix did not account for the time spent on
the runqueue waiting. After the fix, the correct values were reported
back to user space.

Signed-off-by: Bharath Ravi <bharathravi1@gmail.com>
Signed-off-by: Madhava K R  <madhavakr@gmail.com>
Cc: dhaval@linux.vnet.ibm.com
Cc: vatsa@in.ibm.com
Cc: balbir@in.ibm.com
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_stats.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index a38878e0e49d..80179ef7450e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -198,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t)
 /*
  * Called when a process ceases being the active-running process, either
  * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
@@ -206,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t)
 
 	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
+
+	if (t->state == TASK_RUNNING)
+		sched_info_queued(t);
 }
 
 /*
-- 
cgit v1.2.2