From 7378547f2c83ca16a30d0a7c488a43a688ea0888 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Wed, 24 Oct 2007 18:23:48 +0200 Subject: sched: fix sched_domain sysctl registration again commit 029190c515f15f512ac85de8fc686d4dbd0ae731 (cpuset sched_load_balance flag) was not tested with SCHED_DEBUG enabled: as committed it dereferences NULL when used, and it reordered the sysctl registration so that it never shows any domains or their tunables. Fixes: 1) restore the arch_init_sched_domains ordering: we can't walk the domains before we build them; presently we register cpus with empty directories (no domain directories or files). 2) make unregister_sched_domain_sysctl do nothing when already unregistered: detach_destroy_domains is now called one set of cpus at a time, and unregister_sysctl_table dereferences NULL if called with a null pointer. While the function would always dereference NULL if called twice, in the previous code it was always called once and then followed by a register. So only the hidden bug of the sysctl_root_table not being allocated, followed by an attempt to free it, would have shown the error. 3) always call unregister and register in partition_sched_domains: the code is "smart" about unregistering only needed domains. Since we aren't guaranteed any calls to unregister, always unregister. Without calling register on the way out we will not have a table or any sysctl tree. 4) warn if register is called without unregistering: the previous table memory is lost, leaving pointers to the later-freed memory in sysctl and leaking the memory of the tables. Before this patch, on a 2-core 4-thread box compiled for SMT and NUMA, the domains appear empty (there are actually 3 levels per cpu). And as soon as there are two domains, a NULL pointer is dereferenced (the "unreliable" entry in this case is stack garbage): bu19a:~# ls -R /proc/sys/kernel/sched_domain/ /proc/sys/kernel/sched_domain/: cpu0 cpu1 cpu2 cpu3 /proc/sys/kernel/sched_domain/cpu0: /proc/sys/kernel/sched_domain/cpu1: /proc/sys/kernel/sched_domain/cpu2: /proc/sys/kernel/sched_domain/cpu3: bu19a:~# mkdir /dev/cpuset bu19a:~# mount -tcpuset cpuset /dev/cpuset/ bu19a:~# cd /dev/cpuset/ bu19a:/dev/cpuset# echo 0 > sched_load_balance bu19a:/dev/cpuset# mkdir one bu19a:/dev/cpuset# echo 1 > one/cpus bu19a:/dev/cpuset# echo 0 > one/sched_load_balance Unable to handle kernel paging request for data at address 0x00000018 Faulting instruction address: 0xc00000000006b608 NIP: c00000000006b608 LR: c00000000006b604 CTR: 0000000000000000 REGS: c000000018d973f0 TRAP: 0300 Not tainted (2.6.23-bml) MSR: 9000000000009032 CR: 28242442 XER: 00000000 DAR: 0000000000000018, DSISR: 0000000040000000 TASK = c00000001912e340[1987] 'bash' THREAD: c000000018d94000 CPU: 2 ..
NIP [c00000000006b608] .unregister_sysctl_table+0x38/0x110 LR [c00000000006b604] .unregister_sysctl_table+0x34/0x110 Call Trace: [c000000018d97670] [c000000007017270] 0xc000000007017270 (unreliable) [c000000018d97720] [c000000000058710] .detach_destroy_domains+0x30/0xb0 [c000000018d977b0] [c00000000005cf1c] .partition_sched_domains+0x1bc/0x230 [c000000018d97870] [c00000000009fdc4] .rebuild_sched_domains+0xb4/0x4c0 [c000000018d97970] [c0000000000a02e8] .update_flag+0x118/0x170 [c000000018d97a80] [c0000000000a1768] .cpuset_common_file_write+0x568/0x820 [c000000018d97c00] [c00000000009d95c] .cgroup_file_write+0x7c/0x180 [c000000018d97cf0] [c0000000000e76b8] .vfs_write+0xe8/0x1b0 [c000000018d97d90] [c0000000000e810c] .sys_write+0x4c/0x90 [c000000018d97e30] [c00000000000852c] syscall_exit+0x0/0x40 Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 2810e562a991..e51f0eabfef2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5461,11 +5461,12 @@ static void register_sched_domain_sysctl(void) struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + if (entry == NULL) return; - sd_ctl_dir[0].child = entry; - for_each_online_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); @@ -5473,14 +5474,19 @@ static void register_sched_domain_sysctl(void) entry->child = sd_alloc_ctl_cpu_table(i); entry++; } + + WARN_ON(sd_sysctl_header); sd_sysctl_header = register_sysctl_table(sd_ctl_root); } +/* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - unregister_sysctl_table(sd_sysctl_header); + if (sd_sysctl_header) + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; - sd_free_ctl_entry(&sd_ctl_dir[0].child); + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); } #else static void register_sched_domain_sysctl(void) @@ -6424,13 +6430,17 @@ static cpumask_t fallback_doms; */ static int arch_init_sched_domains(const cpumask_t *cpu_map) { + int err; + ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) doms_cur = &fallback_doms; cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); - return build_sched_domains(doms_cur); + + return err; } static void arch_destroy_sched_domains(const cpumask_t *cpu_map) @@ -6479,6 +6489,9 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) { int i, j; + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + if (doms_new == NULL) { ndoms_new = 1; doms_new = &fallback_doms; @@ -6514,6 +6527,8 @@ match2: kfree(doms_cur); doms_cur = doms_new; ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -- cgit v1.2.2 From b15136e9497ef5d6e08cf665e0d0acf7a229f6dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Oct 2007 18:23:48 +0200 Subject: sched: fix fastcall mismatch in completion APIs Jeff Dike noticed that wait_for_completion_interruptible()'s prototype had a mismatched fastcall. Fix this by removing the fastcall attributes from all the completion APIs. 
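For readers who don't use these APIs every day, here is a minimal sketch of how the (now plain, non-fastcall) completion calls are typically paired; the completion name and the two helper functions below are illustrative only, not taken from this patch:

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);	/* illustrative completion object */

/* Signalling side: wake up everyone blocked in wait_for_setup(). */
static void setup_finished(void)
{
	complete(&setup_done);
}

/* Waiting side: sleeps until complete() is called; returns 0 on
 * completion or -ERESTARTSYS if interrupted by a signal. */
static int wait_for_setup(void)
{
	return wait_for_completion_interruptible(&setup_done);
}

The _timeout variants follow the same pattern but take a jiffies timeout and return the remaining time (0 if they timed out).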
Found-by: Jeff Dike Signed-off-by: Ingo Molnar --- include/linux/completion.h | 18 +++++++++--------- kernel/sched.c | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/completion.h b/include/linux/completion.h index 268c5a4a2bd4..33d6aaf94447 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -42,15 +42,15 @@ static inline void init_completion(struct completion *x) init_waitqueue_head(&x->wait); } -extern void FASTCALL(wait_for_completion(struct completion *)); -extern int FASTCALL(wait_for_completion_interruptible(struct completion *x)); -extern unsigned long FASTCALL(wait_for_completion_timeout(struct completion *x, - unsigned long timeout)); -extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout( - struct completion *x, unsigned long timeout)); - -extern void FASTCALL(complete(struct completion *)); -extern void FASTCALL(complete_all(struct completion *)); +extern void wait_for_completion(struct completion *); +extern int wait_for_completion_interruptible(struct completion *x); +extern unsigned long wait_for_completion_timeout(struct completion *x, + unsigned long timeout); +extern unsigned long wait_for_completion_interruptible_timeout( + struct completion *x, unsigned long timeout); + +extern void complete(struct completion *); +extern void complete_all(struct completion *); #define INIT_COMPLETION(x) ((x).done = 0) diff --git a/kernel/sched.c b/kernel/sched.c index e51f0eabfef2..80edf29fa27c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3820,7 +3820,7 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -void fastcall complete(struct completion *x) +void complete(struct completion *x) { unsigned long flags; @@ -3832,7 +3832,7 @@ void fastcall complete(struct completion *x) } EXPORT_SYMBOL(complete); -void fastcall complete_all(struct completion *x) +void complete_all(struct completion *x) { unsigned long flags; @@ -3884,13 +3884,13 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } -void fastcall __sched wait_for_completion(struct completion *x) +void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); -unsigned long fastcall __sched +unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); @@ -3906,7 +3906,7 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); -unsigned long fastcall __sched +unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { -- cgit v1.2.2 From 4dcf6aff023d9934630fb3649284951831c51f8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Oct 2007 18:23:48 +0200 Subject: sched: clean up sched_domain_debug() clean up sched_domain_debug(). 
this also shrinks the code a bit: text data bss dec hex filename 50474 4306 480 55260 d7dc sched.o.before 50404 4306 480 55190 d796 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 146 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 80edf29fa27c..af02a4de069b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5617,101 +5617,101 @@ int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #ifdef CONFIG_SCHED_DEBUG -static void sched_domain_debug(struct sched_domain *sd, int cpu) + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) { - int level = 0; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + char str[NR_CPUS]; - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_CONT "span %s\n", str); + + if (!cpu_isset(cpu, sd->span)) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpu_isset(cpu, group->cpumask)) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { - int i; - char str[NR_CPUS]; - struct sched_group *group = sd->groups; - cpumask_t groupmask; - - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); - - printk(KERN_DEBUG); - for (i = 0; i < level + 1; i++) - printk(" "); - printk("domain %d: ", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); break; } - printk("span %s\n", str); + if (!group->__cpu_power) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); + break; + } - if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + if (!cpus_weight(group->cpumask)) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } - printk(KERN_DEBUG); - for (i = 0; i < level + 2; i++) - printk(" "); - printk("groups:"); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } + if (cpus_intersects(groupmask, group->cpumask)) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } - if (!group->__cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); - break; - } + cpus_or(groupmask, groupmask, group->cpumask); - if (!cpus_weight(group->cpumask)) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - break; - } + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(KERN_CONT " %s", str); - if (cpus_intersects(groupmask, group->cpumask)) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT 
"\n"); - cpus_or(groupmask, groupmask, group->cpumask); + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - cpumask_scnprintf(str, NR_CPUS, group->cpumask); - printk(KERN_CONT " %s", str); + if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; - if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span " - "domain->span\n"); + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level)) + break; level++; sd = sd->parent; if (!sd) - continue; - - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - - } while (sd); + break; + } } #else # define sched_domain_debug(sd, cpu) do { } while (0) -- cgit v1.2.2 From 838225b48edc971620cbeb292034dabd2b0d7d1d Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Wed, 24 Oct 2007 18:23:50 +0200 Subject: sched: use show_regs() to improve __schedule_bug() output A full register dump along with stack backtrace would make the "scheduling while atomic" message more helpful. Use show_regs() instead of dump_stack() for this. We already know we're atomic in here (that is why this function was called) so show_regs()'s atomicity expectations are guaranteed. Also, modify the output of the "BUG: scheduling while atomic:" header a bit to keep task->comm and task->pid together and preempt_count() after them. Signed-off-by: Satyam Sharma Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index af02a4de069b..d1e6663d3ab0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -66,6 +66,7 @@ #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -3507,12 +3508,19 @@ EXPORT_SYMBOL(sub_preempt_count); */ static noinline void __schedule_bug(struct task_struct *prev) { - printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", - prev->comm, preempt_count(), task_pid_nr(prev)); + struct pt_regs *regs = get_irq_regs(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + debug_show_held_locks(prev); if (irqs_disabled()) print_irqtrace_events(prev); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); } /* -- cgit v1.2.2 From b3da2a73ff5a2953a4ad8ebbf0aa7e6965ff9de2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 24 Oct 2007 18:23:50 +0200 Subject: sched: document profile=sleep requiring CONFIG_SCHEDSTATS profile=sleep only works if CONFIG_SCHEDSTATS is set. This patch notes the limitation in Documentation/kernel-parameters.txt and prints a warning at boot-time if profile=sleep is used without CONFIG_SCHEDSTAT. 
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 ++- kernel/profile.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a13d69b2217d..8ae5fac08dfa 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1444,7 +1444,8 @@ and is between 256 and 4096 characters. It is defined in the file Param: "schedule" - profile schedule points. Param: - step/bucket size as a power of 2 for statistical time based profiling. - Param: "sleep" - profile D-state sleeping (millisecs) + Param: "sleep" - profile D-state sleeping (millisecs). + Requires CONFIG_SCHEDSTATS Param: "kvm" - profile VM exits. processor.max_cstate= [HW,ACPI] diff --git a/kernel/profile.c b/kernel/profile.c index 631b75c25d7e..5e95330e5120 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -60,6 +60,7 @@ static int __init profile_setup(char * str) int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { +#ifdef CONFIG_SCHEDSTATS prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; @@ -68,6 +69,10 @@ static int __init profile_setup(char * str) printk(KERN_INFO "kernel sleep profiling enabled (shift: %ld)\n", prof_shift); +#else + printk(KERN_WARNING + "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); +#endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; if (str[strlen(schedstr)] == ',') -- cgit v1.2.2 From a8972ccf00b7184a743eb6cd9bc7f3443357910c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 24 Oct 2007 18:23:50 +0200 Subject: sched: constify sched.h Add const to some struct task_struct * uses Signed-off-by: Joe Perches Signed-off-by: Ingo Molnar --- include/linux/sched.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 13df99fb2769..52288a647692 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1196,7 +1196,7 @@ static inline int rt_prio(int prio) return 0; } -static inline int rt_task(struct task_struct *p) +static inline int rt_task(const struct task_struct *p) { return rt_prio(p->prio); } @@ -1211,22 +1211,22 @@ static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) tsk->signal->__pgrp = pgrp; } -static inline struct pid *task_pid(struct task_struct *task) +static inline struct pid *task_pid(const struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; } -static inline struct pid *task_tgid(struct task_struct *task) +static inline struct pid *task_tgid(const struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PID].pid; } -static inline struct pid *task_pgrp(struct task_struct *task) +static inline struct pid *task_pgrp(const struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PGID].pid; } -static inline struct pid *task_session(struct task_struct *task) +static inline struct pid *task_session(const struct task_struct *task) { return task->group_leader->pids[PIDTYPE_SID].pid; } @@ -1255,7 +1255,7 @@ struct pid_namespace; * see also pid_nr() etc in include/linux/pid.h */ -static inline pid_t task_pid_nr(struct task_struct *tsk) +static inline pid_t task_pid_nr(const struct task_struct *tsk) { return tsk->pid; } @@ -1268,7 +1268,7 @@ static inline pid_t task_pid_vnr(struct task_struct *tsk) } -static inline pid_t task_tgid_nr(struct task_struct *tsk) +static inline 
pid_t task_tgid_nr(const struct task_struct *tsk) { return tsk->tgid; } @@ -1281,7 +1281,7 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) +static inline pid_t task_pgrp_nr(const struct task_struct *tsk) { return tsk->signal->__pgrp; } @@ -1294,7 +1294,7 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) +static inline pid_t task_session_nr(const struct task_struct *tsk) { return tsk->signal->__session; } @@ -1321,7 +1321,7 @@ static inline pid_t task_ppid_nr_ns(struct task_struct *tsk, * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. */ -static inline int pid_alive(struct task_struct *p) +static inline int pid_alive(const struct task_struct *p) { return p->pids[PIDTYPE_PID].pid != NULL; } @@ -1332,7 +1332,7 @@ static inline int pid_alive(struct task_struct *p) * * Check if a task structure is the first user space task the kernel created. */ -static inline int is_global_init(struct task_struct *tsk) +static inline int is_global_init(const struct task_struct *tsk) { return tsk->pid == 1; } @@ -1469,7 +1469,7 @@ extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern void rt_mutex_adjust_pi(struct task_struct *p); #else -static inline int rt_mutex_getprio(struct task_struct *p) +static inline int rt_mutex_getprio(const struct task_struct *p) { return p->normal_prio; } @@ -1721,7 +1721,7 @@ extern void wait_task_inactive(struct task_struct * p); * all we care about is that we have a task with the appropriate * pid, we don't actually care if we have the right task. */ -static inline int has_group_leader_pid(struct task_struct *p) +static inline int has_group_leader_pid(const struct task_struct *p) { return p->pid == p->tgid; } @@ -1738,7 +1738,7 @@ static inline struct task_struct *next_thread(const struct task_struct *p) struct task_struct, thread_group); } -static inline int thread_group_empty(struct task_struct *p) +static inline int thread_group_empty(const struct task_struct *p) { return list_empty(&p->thread_group); } -- cgit v1.2.2 From 2b01dfe37203e825edd8417ad3993d01cbbb527e Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 24 Oct 2007 18:23:50 +0200 Subject: sched: clean up some control group code - replace "cont" with "cgrp" in a few places in the CFS cgroup code, - use write_uint rather than write for cpu.shares write function Signed-off-by: Paul Menage Acked-by : Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/sched.c | 53 ++++++++++++++++++----------------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d1e6663d3ab0..cc9cd5b710a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7124,25 +7124,25 @@ unsigned long sched_group_shares(struct task_group *tg) #ifdef CONFIG_FAIR_CGROUP_SCHED /* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cont) +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id), - struct task_group, css); + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + struct task_group, css); } static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup 
*cgrp) { struct task_group *tg; - if (!cont->parent) { + if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cont; + init_task_group.css.cgroup = cgrp; return &init_task_group.css; } /* we support only 1-level deep hierarchical scheduler atm */ - if (cont->parent->parent) + if (cgrp->parent->parent) return ERR_PTR(-EINVAL); tg = sched_create_group(); @@ -7150,21 +7150,21 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return ERR_PTR(-ENOMEM); /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cont; + tg->css.cgroup = cgrp; return &tg->css; } static void cpu_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) + struct cgroup *cgrp) { - struct task_group *tg = cgroup_tg(cont); + struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) + struct cgroup *cgrp, struct task_struct *tsk) { /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) @@ -7174,38 +7174,21 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, } static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont, +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cont, struct task_struct *tsk) { sched_move_task(tsk); } -static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 shareval) { - unsigned long shareval; - struct task_group *tg = cgroup_tg(cont); - char buffer[2*sizeof(unsigned long) + 1]; - int rc; - - if (nbytes > 2*sizeof(unsigned long)) /* safety check */ - return -E2BIG; - - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - shareval = simple_strtoul(buffer, NULL, 10); - - rc = sched_group_set_shares(tg, shareval); - - return (rc < 0 ? rc : nbytes); + return sched_group_set_shares(cgroup_tg(cgrp), shareval); } -static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft) +static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) { - struct task_group *tg = cgroup_tg(cont); + struct task_group *tg = cgroup_tg(cgrp); return (u64) tg->shares; } @@ -7213,7 +7196,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft) static struct cftype cpu_shares = { .name = "shares", .read_uint = cpu_shares_read_uint, - .write = cpu_shares_write, + .write_uint = cpu_shares_write_uint, }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -- cgit v1.2.2 From a0f846aa76c3e03d54c1700a87cab3a46ccd71e2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 24 Oct 2007 18:23:50 +0200 Subject: sched: make cpu_shares_{show,store}() static cpu_shares_{show,store}() can become static. 
Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar --- kernel/user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/user.c b/kernel/user.c index e91331c457e2..0f3aa0234107 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -129,7 +129,7 @@ static inline void uids_mutex_unlock(void) } /* return cpu shares held by the user */ -ssize_t cpu_shares_show(struct kset *kset, char *buffer) +static ssize_t cpu_shares_show(struct kset *kset, char *buffer) { struct user_struct *up = container_of(kset, struct user_struct, kset); @@ -137,7 +137,8 @@ ssize_t cpu_shares_show(struct kset *kset, char *buffer) } /* modify cpu shares held by the user */ -ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) +static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, + size_t size) { struct user_struct *up = container_of(kset, struct user_struct, kset); unsigned long shares; -- cgit v1.2.2 From e1d1484f72127a5580d37c379f6a5b2c2786434c Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Wed, 24 Oct 2007 18:23:51 +0200 Subject: sched: reduce balance-tasks overhead At the moment, balance_tasks() provides low level functionality for both move_tasks() and move_one_task() (indirectly) via the load_balance() function (in the sched_class interface) which also provides dual functionality. This dual functionality complicates the interfaces and internal mechanisms and adds to the run time overhead of operations that are called with two run queue locks held. This patch addresses this issue and reduces the overhead of these operations. Signed-off-by: Peter Williams Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++- kernel/sched.c | 99 +++++++++++++++++++++++++++++++++---------------- kernel/sched_fair.c | 44 ++++++++++++++++------ kernel/sched_idletask.c | 14 +++++-- kernel/sched_rt.c | 28 +++++++++----- 5 files changed, 135 insertions(+), 57 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 52288a647692..639241f4f3d1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -829,11 +829,14 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, - struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, + struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio); + int (*move_one_task) (struct rq *this_rq, int this_cpu, + struct rq *busiest, struct sched_domain *sd, + enum cpu_idle_type idle); + void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index cc9cd5b710a6..8607795fad69 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -838,11 +838,35 @@ struct rq_iterator { struct task_struct *(*next)(void *); }; -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int *this_best_prio, struct rq_iterator *iterator); +#ifdef CONFIG_SMP +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator); + +static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator); +#else +static inline unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator) +{ + return 0; +} + +static inline int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator) +{ + return 0; +} +#endif #include "sched_stats.h" #include "sched_idletask.c" @@ -2224,17 +2248,17 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 1; } -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int *this_best_prio, struct rq_iterator *iterator) +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator) { int pulled = 0, pinned = 0, skip_for_load; struct task_struct *p; long rem_load_move = max_load_move; - if (max_nr_move == 0 || max_load_move == 0) + if (max_load_move == 0) goto out; pinned = 1; @@ -2267,7 +2291,7 @@ next: * We only want to steal up to the prescribed number of tasks * and the prescribed amount of weighted load. */ - if (pulled < max_nr_move && rem_load_move > 0) { + if (rem_load_move > 0) { if (p->prio < *this_best_prio) *this_best_prio = p->prio; p = iterator->next(iterator->arg); @@ -2275,7 +2299,7 @@ next: } out: /* - * Right now, this is the only place pull_task() is called, + * Right now, this is one of only two places pull_task() is called, * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ @@ -2283,8 +2307,8 @@ out: if (all_pinned) *all_pinned = pinned; - *load_moved = max_load_move - rem_load_move; - return pulled; + + return max_load_move - rem_load_move; } /* @@ -2306,7 +2330,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, do { total_load_moved += class->load_balance(this_rq, this_cpu, busiest, - ULONG_MAX, max_load_move - total_load_moved, + max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); class = class->next; } while (class && max_load_move > total_load_moved); @@ -2314,6 +2338,32 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, return total_load_moved > 0; } +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator) +{ + struct task_struct *p = iterator->start(iterator->arg); + int pinned = 0; + + while (p) { + if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). 
+ */ + schedstat_inc(sd, lb_gained[idle]); + + return 1; + } + p = iterator->next(iterator->arg); + } + + return 0; +} + /* * move_one_task tries to move exactly one task from busiest to this_rq, as * part of active balancing operations within "domain". @@ -2325,12 +2375,9 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { const struct sched_class *class; - int this_best_prio = MAX_PRIO; for (class = sched_class_highest; class; class = class->next) - if (class->load_balance(this_rq, this_cpu, busiest, - 1, ULONG_MAX, sd, idle, NULL, - &this_best_prio)) + if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; return 0; @@ -3267,18 +3314,6 @@ static inline void idle_balance(int cpu, struct rq *rq) { } -/* Avoid "used but not defined" warning on UP */ -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int *this_best_prio, struct rq_iterator *iterator) -{ - *load_moved = 0; - - return 0; -} - #endif DEFINE_PER_CPU(struct kernel_stat, kstat); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 166ed6db600b..a90d0457d603 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -936,12 +936,11 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, + unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { struct cfs_rq *busy_cfs_rq; - unsigned long load_moved, total_nr_moved = 0, nr_moved; long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; @@ -969,25 +968,47 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, #else # define maxload rem_load_move #endif - /* pass busy_cfs_rq argument into + /* + * pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; - nr_moved = balance_tasks(this_rq, this_cpu, busiest, - max_nr_move, maxload, sd, idle, all_pinned, - &load_moved, this_best_prio, &cfs_rq_iterator); - - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; + rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, + maxload, sd, idle, all_pinned, + this_best_prio, + &cfs_rq_iterator); - if (max_nr_move <= 0 || rem_load_move <= 0) + if (rem_load_move <= 0) break; } return max_load_move - rem_load_move; } +static int +move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct cfs_rq *busy_cfs_rq; + struct rq_iterator cfs_rq_iterator; + + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { + /* + * pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, + &cfs_rq_iterator)) + return 1; + } + + return 0; +} + /* * scheduler tick hitting a task of our scheduling class: */ @@ -1064,6 +1085,7 @@ static const struct sched_class fair_sched_class = { .put_prev_task = put_prev_task_fair, .load_balance = load_balance_fair, + .move_one_task = move_one_task_fair, .set_curr_task = 
set_curr_task_fair, .task_tick = task_tick_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 6e2ead41516e..586b06ca30aa 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -39,9 +39,16 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) static unsigned long load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return 0; +} + +static int +move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) { return 0; } @@ -70,6 +77,7 @@ const struct sched_class idle_sched_class = { .put_prev_task = put_prev_task_idle, .load_balance = load_balance_idle, + .move_one_task = move_one_task_idle, .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d0097a0634e5..e9395b7119e6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -172,13 +172,11 @@ static struct task_struct *load_balance_next_rt(void *arg) static unsigned long load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) { - int nr_moved; struct rq_iterator rt_rq_iterator; - unsigned long load_moved; rt_rq_iterator.start = load_balance_start_rt; rt_rq_iterator.next = load_balance_next_rt; @@ -187,11 +185,22 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, */ rt_rq_iterator.arg = busiest; - nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, - max_load_move, sd, idle, all_pinned, &load_moved, - this_best_prio, &rt_rq_iterator); + return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, + idle, all_pinned, this_best_prio, &rt_rq_iterator); +} + +static int +move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct rq_iterator rt_rq_iterator; + + rt_rq_iterator.start = load_balance_start_rt; + rt_rq_iterator.next = load_balance_next_rt; + rt_rq_iterator.arg = busiest; - return load_moved; + return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, + &rt_rq_iterator); } static void task_tick_rt(struct rq *rq, struct task_struct *p) @@ -237,6 +246,7 @@ const struct sched_class rt_sched_class = { .put_prev_task = put_prev_task_rt, .load_balance = load_balance_rt, + .move_one_task = move_one_task_rt, .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, -- cgit v1.2.2 From 681f3e68541d6f03e3e05d21fe15093578b8b539 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Wed, 24 Oct 2007 18:23:51 +0200 Subject: sched: isolate SMP balancing code a bit more At the moment, a lot of load balancing code that is irrelevant to non SMP systems gets included during non SMP builds. 
This patch addresses this issue and reduces the binary size on non SMP systems: text data bss dec hex filename 10983 28 1192 12203 2fab sched.o.before 10739 28 1192 11959 2eb7 sched.o.after Signed-off-by: Peter Williams Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 17 ----------------- kernel/sched_fair.c | 4 ++++ kernel/sched_idletask.c | 4 ++++ kernel/sched_rt.c | 4 ++++ 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 639241f4f3d1..24e08d1d900d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -828,6 +828,7 @@ struct sched_class { struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); +#ifdef CONFIG_SMP unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, @@ -836,6 +837,7 @@ struct sched_class { int (*move_one_task) (struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); +#endif void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 8607795fad69..b4fbbc440453 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -849,23 +849,6 @@ static int iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle, struct rq_iterator *iterator); -#else -static inline unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator) -{ - return 0; -} - -static inline int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator) -{ - return 0; -} #endif #include "sched_stats.h" diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a90d0457d603..9971831b560e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -876,6 +876,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: */ @@ -1008,6 +1009,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } +#endif /* * scheduler tick hitting a task of our scheduling class: @@ -1084,8 +1086,10 @@ static const struct sched_class fair_sched_class = { .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, +#ifdef CONFIG_SMP .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, +#endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 586b06ca30aa..bf9c25c15b8b 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -37,6 +37,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } +#ifdef CONFIG_SMP static unsigned long load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -52,6 +53,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, { return 0; } +#endif static void task_tick_idle(struct rq *rq, struct task_struct *curr) { @@ -76,8 +78,10 @@ const struct sched_class idle_sched_class = { 
.pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, +#ifdef CONFIG_SMP .load_balance = load_balance_idle, .move_one_task = move_one_task_idle, +#endif .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e9395b7119e6..8abd752a0ebd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -98,6 +98,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) p->se.exec_start = 0; } +#ifdef CONFIG_SMP /* * Load-balancing iterator. Note: while the runqueue stays locked * during the whole iteration, the current task might be @@ -202,6 +203,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, &rt_rq_iterator); } +#endif static void task_tick_rt(struct rq *rq, struct task_struct *p) { @@ -245,8 +247,10 @@ const struct sched_class rt_sched_class = { .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, +#ifdef CONFIG_SMP .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, +#endif .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, -- cgit v1.2.2 From 8ef93cf11413e3f2dc28bfaf736e1f49981ed700 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Oct 2007 18:23:51 +0200 Subject: sched: mark CONFIG_FAIR_GROUP_SCHED as !EXPERIMENTAL mark CONFIG_FAIR_GROUP_SCHED as !EXPERIMENTAL. All bugs have been fixed and it's perfect ;-) Signed-off-by: Ingo Molnar --- init/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index b7dffa837926..8b88d0bedcbd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -322,7 +322,6 @@ config CPUSETS config FAIR_GROUP_SCHED bool "Fair group CPU scheduler" default y - depends on EXPERIMENTAL help This feature lets CPU scheduler recognize task groups and control CPU bandwidth allocation to such task groups. -- cgit v1.2.2