Diffstat (limited to 'kernel/sched.c')

 -rw-r--r--  kernel/sched.c | 269
 1 file changed, 245 insertions(+), 24 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index ed90be46fb31..afe76ec2e7fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -44,6 +44,7 @@
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/pid_namespace.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/timer.h>
@@ -51,6 +52,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
+#include <linux/cpu_acct.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
@@ -153,10 +155,15 @@ struct rt_prio_array {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#include <linux/cgroup.h>
+
 struct cfs_rq;
 
 /* task group related information */
 struct task_group {
+#ifdef CONFIG_FAIR_CGROUP_SCHED
+	struct cgroup_subsys_state css;
+#endif
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
@@ -197,6 +204,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #ifdef CONFIG_FAIR_USER_SCHED
 	tg = p->user->tg;
+#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
+				struct task_group, css);
 #else
 	tg = &init_task_group;
 #endif
@@ -1875,7 +1885,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	preempt_enable();
 #endif
 	if (current->set_child_tid)
-		put_user(current->pid, current->set_child_tid);
+		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
 /*
@@ -3307,9 +3317,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	cputime64_t tmp;
+	struct rq *rq = this_rq();
 
 	p->utime = cputime_add(p->utime, cputime);
 
+	if (p != rq->idle)
+		cpuacct_charge(p, cputime);
+
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
 	if (TASK_NICE(p) > 0)
@@ -3374,9 +3388,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	else if (softirq_count())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-	else if (p != rq->idle)
+	else if (p != rq->idle) {
 		cpustat->system = cputime64_add(cpustat->system, tmp);
-	else if (atomic_read(&rq->nr_iowait) > 0)
+		cpuacct_charge(p, cputime);
+	} else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3412,8 +3427,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
 			cpustat->idle = cputime64_add(cpustat->idle, tmp);
-	} else
+	} else {
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+		cpuacct_charge(p, -tmp);
+	}
 }
 
 /*
@@ -3493,7 +3510,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 static noinline void __schedule_bug(struct task_struct *prev)
 {
 	printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
-		prev->comm, preempt_count(), prev->pid);
+		prev->comm, preempt_count(), task_pid_nr(prev));
 	debug_show_held_locks(prev);
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
@@ -4159,7 +4176,7 @@ struct task_struct *idle_task(int cpu)
  */
 static struct task_struct *find_process_by_pid(pid_t pid)
 {
-	return pid ? find_task_by_pid(pid) : current;
+	return pid ? find_task_by_vpid(pid) : current;
 }
 
 /* Actually do priority change: must hold rq lock. */
@@ -4462,8 +4479,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
 	cpus_allowed = cpuset_cpus_allowed(p);
 	cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
 	retval = set_cpus_allowed(p, new_mask);
 
+	if (!retval) {
+		cpus_allowed = cpuset_cpus_allowed(p);
+		if (!cpus_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			new_mask = cpus_allowed;
+			goto again;
+		}
+	}
 out_unlock:
 	put_task_struct(p);
 	mutex_unlock(&sched_hotcpu_mutex);
@@ -4843,7 +4873,8 @@ static void show_task(struct task_struct *p)
 		free = (unsigned long)n - (unsigned long)end_of_stack(p);
 	}
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free, p->pid, p->parent->pid);
+	printk(KERN_CONT "%5lu %5d %6d\n", free,
+		task_pid_nr(p), task_pid_nr(p->parent));
 
 	if (state != TASK_RUNNING)
 		show_stack(p, NULL);
@@ -5137,8 +5168,16 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 
 		/* No more Mr. Nice Guy. */
 		if (dest_cpu == NR_CPUS) {
+			cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
+			/*
+			 * Try to stay on the same cpuset, where the
+			 * current cpuset may be a subset of all cpus.
+			 * The cpuset_cpus_allowed_locked() variant of
+			 * cpuset_cpus_allowed() will not block. It must be
+			 * called within calls to cpuset_lock/cpuset_unlock.
+			 */
 			rq = task_rq_lock(p, &flags);
-			cpus_setall(p->cpus_allowed);
+			p->cpus_allowed = cpus_allowed;
 			dest_cpu = any_online_cpu(p->cpus_allowed);
 			task_rq_unlock(rq, &flags);
 
@@ -5150,7 +5189,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 			if (p->mm && printk_ratelimit())
 				printk(KERN_INFO "process %d (%s) no "
 					"longer affine to cpu%d\n",
-					p->pid, p->comm, dead_cpu);
+					task_pid_nr(p), p->comm, dead_cpu);
 		}
 	} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
 }
@@ -5257,7 +5296,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	struct rq *rq = cpu_rq(dead_cpu);
 
 	/* Must be exiting, otherwise would be on tasklist. */
-	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
+	BUG_ON(!p->exit_state);
 
 	/* Cannot have done final schedule yet: would have vanished. */
 	BUG_ON(p->state == TASK_DEAD);
@@ -5504,6 +5543,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
@@ -5517,6 +5557,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
 		spin_unlock_irq(&rq->lock);
+		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 
@@ -6367,26 +6408,31 @@ error:
 	return -ENOMEM;
 #endif
 }
+
+static cpumask_t *doms_cur;	/* current sched domains */
+static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask_t) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask_t fallback_doms.
+ */
+static cpumask_t fallback_doms;
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
  */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
-	cpumask_t cpu_default_map;
-	int err;
-
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
-	err = build_sched_domains(&cpu_default_map);
-
+	ndoms_cur = 1;
+	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
 	register_sched_domain_sysctl();
-
-	return err;
+	return build_sched_domains(doms_cur);
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6410,6 +6456,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	arch_destroy_sched_domains(cpu_map);
 }
 
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * ownership of it and will kfree it when done with it. If the caller
+ * failed the kmalloc call, then it can pass in doms_new == NULL,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms'.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
+{
+	int i, j;
+
+	if (doms_new == NULL) {
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+	}
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		for (j = 0; j < ndoms_new; j++) {
+			if (cpus_equal(doms_cur[i], doms_new[j]))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur + i);
+match1:
+		;
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < ndoms_cur; j++) {
+			if (cpus_equal(doms_new[i], doms_cur[j]))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new + i);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = doms_new;
+	ndoms_cur = ndoms_new;
+}
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 static int arch_reinit_sched_domains(void)
 {
@@ -6991,3 +7099,116 @@ unsigned long sched_group_shares(struct task_group *tg)
 }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_FAIR_CGROUP_SCHED
+
+/* return corresponding task_group object of a cgroup */
+static inline struct task_group *cgroup_tg(struct cgroup *cont)
+{
+	return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
+				struct task_group, css);
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct task_group *tg;
+
+	if (!cont->parent) {
+		/* This is early initialization for the top cgroup */
+		init_task_group.css.cgroup = cont;
+		return &init_task_group.css;
+	}
+
+	/* we support only 1-level deep hierarchical scheduler atm */
+	if (cont->parent->parent)
+		return ERR_PTR(-EINVAL);
+
+	tg = sched_create_group();
+	if (IS_ERR(tg))
+		return ERR_PTR(-ENOMEM);
+
+	/* Bind the cgroup to task_group object we just created */
+	tg->css.cgroup = cont;
+
+	return &tg->css;
+}
+
+static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct task_group *tg = cgroup_tg(cont);
+
+	sched_destroy_group(tg);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
+			struct cgroup *cont, struct task_struct *tsk)
+{
+	/* We don't support RT-tasks being in separate groups */
+	if (tsk->sched_class != &fair_sched_class)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void
+cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+			struct cgroup *old_cont, struct task_struct *tsk)
+{
+	sched_move_task(tsk);
+}
+
+static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
+				struct file *file, const char __user *userbuf,
+				size_t nbytes, loff_t *ppos)
+{
+	unsigned long shareval;
+	struct task_group *tg = cgroup_tg(cont);
+	char buffer[2*sizeof(unsigned long) + 1];
+	int rc;
+
+	if (nbytes > 2*sizeof(unsigned long))	/* safety check */
+		return -E2BIG;
+
+	if (copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
+
+	buffer[nbytes] = 0;	/* nul-terminate */
+	shareval = simple_strtoul(buffer, NULL, 10);
+
+	rc = sched_group_set_shares(tg, shareval);
+
+	return (rc < 0 ? rc : nbytes);
+}
+
+static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cont);
+
+	return (u64) tg->shares;
+}
+
+static struct cftype cpu_shares = {
+	.name = "shares",
+	.read_uint = cpu_shares_read_uint,
+	.write = cpu_shares_write,
+};
+
+static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	return cgroup_add_file(cont, ss, &cpu_shares);
+}
+
+struct cgroup_subsys cpu_cgroup_subsys = {
+	.name		= "cpu",
+	.create		= cpu_cgroup_create,
+	.destroy	= cpu_cgroup_destroy,
+	.can_attach	= cpu_cgroup_can_attach,
+	.attach		= cpu_cgroup_attach,
+	.populate	= cpu_cgroup_populate,
+	.subsys_id	= cpu_cgroup_subsys_id,
+	.early_init	= 1,
+};
+
+#endif	/* CONFIG_FAIR_CGROUP_SCHED */