Diffstat (limited to 'kernel/sched.c')

 -rw-r--r--  kernel/sched.c | 269
 1 file changed, 245 insertions(+), 24 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index ed90be46fb31..afe76ec2e7fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -44,6 +44,7 @@
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/pid_namespace.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/timer.h>
@@ -51,6 +52,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
+#include <linux/cpu_acct.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
@@ -153,10 +155,15 @@ struct rt_prio_array {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#include <linux/cgroup.h>
+
 struct cfs_rq;
 
 /* task group related information */
 struct task_group {
+#ifdef CONFIG_FAIR_CGROUP_SCHED
+	struct cgroup_subsys_state css;
+#endif
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
@@ -197,6 +204,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #ifdef CONFIG_FAIR_USER_SCHED
 	tg = p->user->tg;
+#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
+				struct task_group, css);
 #else
 	tg = &init_task_group;
 #endif
@@ -1875,7 +1885,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	preempt_enable();
 #endif
 	if (current->set_child_tid)
-		put_user(current->pid, current->set_child_tid);
+		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
 /*
@@ -3307,9 +3317,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	cputime64_t tmp;
+	struct rq *rq = this_rq();
 
 	p->utime = cputime_add(p->utime, cputime);
 
+	if (p != rq->idle)
+		cpuacct_charge(p, cputime);
+
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
 	if (TASK_NICE(p) > 0)
@@ -3374,9 +3388,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	else if (softirq_count())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-	else if (p != rq->idle)
+	else if (p != rq->idle) {
 		cpustat->system = cputime64_add(cpustat->system, tmp);
-	else if (atomic_read(&rq->nr_iowait) > 0)
+		cpuacct_charge(p, cputime);
+	} else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3412,8 +3427,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
 			cpustat->idle = cputime64_add(cpustat->idle, tmp);
-	} else
+	} else {
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+		cpuacct_charge(p, -tmp);
+	}
 }
 
 /*
@@ -3493,7 +3510,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 static noinline void __schedule_bug(struct task_struct *prev)
 {
 	printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
-		prev->comm, preempt_count(), prev->pid);
+		prev->comm, preempt_count(), task_pid_nr(prev));
 	debug_show_held_locks(prev);
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
@@ -4159,7 +4176,7 @@ struct task_struct *idle_task(int cpu)
  */
 static struct task_struct *find_process_by_pid(pid_t pid)
 {
-	return pid ? find_task_by_pid(pid) : current;
+	return pid ? find_task_by_vpid(pid) : current;
 }
 
 /* Actually do priority change: must hold rq lock. */
@@ -4462,8 +4479,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
 	cpus_allowed = cpuset_cpus_allowed(p);
 	cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
 	retval = set_cpus_allowed(p, new_mask);
 
+	if (!retval) {
+		cpus_allowed = cpuset_cpus_allowed(p);
+		if (!cpus_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			new_mask = cpus_allowed;
+			goto again;
+		}
+	}
 out_unlock:
 	put_task_struct(p);
 	mutex_unlock(&sched_hotcpu_mutex);
@@ -4843,7 +4873,8 @@ static void show_task(struct task_struct *p)
 		free = (unsigned long)n - (unsigned long)end_of_stack(p);
 	}
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free, p->pid, p->parent->pid);
+	printk(KERN_CONT "%5lu %5d %6d\n", free,
+		task_pid_nr(p), task_pid_nr(p->parent));
 
 	if (state != TASK_RUNNING)
 		show_stack(p, NULL);
@@ -5137,8 +5168,16 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 
 		/* No more Mr. Nice Guy. */
 		if (dest_cpu == NR_CPUS) {
+			cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
+			/*
+			 * Try to stay on the same cpuset, where the
+			 * current cpuset may be a subset of all cpus.
+			 * The cpuset_cpus_allowed_locked() variant of
+			 * cpuset_cpus_allowed() will not block. It must be
+			 * called within calls to cpuset_lock/cpuset_unlock.
+			 */
 			rq = task_rq_lock(p, &flags);
-			cpus_setall(p->cpus_allowed);
+			p->cpus_allowed = cpus_allowed;
 			dest_cpu = any_online_cpu(p->cpus_allowed);
 			task_rq_unlock(rq, &flags);
 
@@ -5150,7 +5189,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 			if (p->mm && printk_ratelimit())
 				printk(KERN_INFO "process %d (%s) no "
 					"longer affine to cpu%d\n",
-					p->pid, p->comm, dead_cpu);
+					task_pid_nr(p), p->comm, dead_cpu);
 		}
 	} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
 }
@@ -5257,7 +5296,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	struct rq *rq = cpu_rq(dead_cpu);
 
 	/* Must be exiting, otherwise would be on tasklist. */
-	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
+	BUG_ON(!p->exit_state);
 
 	/* Cannot have done final schedule yet: would have vanished. */
 	BUG_ON(p->state == TASK_DEAD);
@@ -5504,6 +5543,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
@@ -5517,6 +5557,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
 		spin_unlock_irq(&rq->lock);
+		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 
@@ -6367,26 +6408,31 @@ error:
 	return -ENOMEM;
 #endif
 }
+
+static cpumask_t *doms_cur;	/* current sched domains */
+static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask_t) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask_t fallback_doms.
+ */
+static cpumask_t fallback_doms;
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
  */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
-	cpumask_t cpu_default_map;
-	int err;
-
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
-	err = build_sched_domains(&cpu_default_map);
-
+	ndoms_cur = 1;
+	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
 	register_sched_domain_sysctl();
-
-	return err;
+	return build_sched_domains(doms_cur);
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6410,6 +6456,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	arch_destroy_sched_domains(cpu_map);
 }
 
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * ownership of it and will kfree it when done with it. If the caller
+ * failed the kmalloc call, then it can pass in doms_new == NULL,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms'.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
+{
+	int i, j;
+
+	if (doms_new == NULL) {
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+	}
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		for (j = 0; j < ndoms_new; j++) {
+			if (cpus_equal(doms_cur[i], doms_new[j]))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur + i);
+match1:
+		;
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < ndoms_cur; j++) {
+			if (cpus_equal(doms_new[i], doms_cur[j]))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new + i);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = doms_new;
+	ndoms_cur = ndoms_new;
+}
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 static int arch_reinit_sched_domains(void)
 {
@@ -6991,3 +7099,116 @@ unsigned long sched_group_shares(struct task_group *tg)
 }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_FAIR_CGROUP_SCHED
+
+/* return corresponding task_group object of a cgroup */
+static inline struct task_group *cgroup_tg(struct cgroup *cont)
+{
+	return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
+				struct task_group, css);
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct task_group *tg;
+
+	if (!cont->parent) {
+		/* This is early initialization for the top cgroup */
+		init_task_group.css.cgroup = cont;
+		return &init_task_group.css;
+	}
+
+	/* we support only 1-level deep hierarchical scheduler atm */
+	if (cont->parent->parent)
+		return ERR_PTR(-EINVAL);
+
+	tg = sched_create_group();
+	if (IS_ERR(tg))
+		return ERR_PTR(-ENOMEM);
+
+	/* Bind the cgroup to task_group object we just created */
+	tg->css.cgroup = cont;
+
+	return &tg->css;
+}
+
+static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct task_group *tg = cgroup_tg(cont);
+
+	sched_destroy_group(tg);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
+			struct cgroup *cont, struct task_struct *tsk)
+{
+	/* We don't support RT-tasks being in separate groups */
+	if (tsk->sched_class != &fair_sched_class)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void
+cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+			struct cgroup *old_cont, struct task_struct *tsk)
+{
+	sched_move_task(tsk);
+}
+
+static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
+				struct file *file, const char __user *userbuf,
+				size_t nbytes, loff_t *ppos)
+{
+	unsigned long shareval;
+	struct task_group *tg = cgroup_tg(cont);
+	char buffer[2*sizeof(unsigned long) + 1];
+	int rc;
+
+	if (nbytes > 2*sizeof(unsigned long))	/* safety check */
+		return -E2BIG;
+
+	if (copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
+
+	buffer[nbytes] = 0;	/* nul-terminate */
+	shareval = simple_strtoul(buffer, NULL, 10);
+
+	rc = sched_group_set_shares(tg, shareval);
+
+	return (rc < 0 ? rc : nbytes);
+}
+
+static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cont);
+
+	return (u64) tg->shares;
+}
+
+static struct cftype cpu_shares = {
+	.name = "shares",
+	.read_uint = cpu_shares_read_uint,
+	.write = cpu_shares_write,
+};
+
+static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	return cgroup_add_file(cont, ss, &cpu_shares);
+}
+
+struct cgroup_subsys cpu_cgroup_subsys = {
+	.name		= "cpu",
+	.create		= cpu_cgroup_create,
+	.destroy	= cpu_cgroup_destroy,
+	.can_attach	= cpu_cgroup_can_attach,
+	.attach		= cpu_cgroup_attach,
+	.populate	= cpu_cgroup_populate,
+	.subsys_id	= cpu_cgroup_subsys_id,
+	.early_init	= 1,
+};
+
+#endif	/* CONFIG_FAIR_CGROUP_SCHED */