author		David Vrabel <david.vrabel@csr.com>	2009-01-02 08:17:13 -0500
committer	David Vrabel <david.vrabel@csr.com>	2009-01-02 08:17:13 -0500
commit		b21a207141d83a06abc5f492b80204602e02ca44 (patch)
tree		f0152cde543008c72d7eb5c12c18095ad92785e6	/kernel/sched.c
parent		3af373021fa32f8f787bfbdcc1a9277a287bde4e (diff)
parent		b58602a4bac012b5f4fc12fe6b46ab237b610d5d (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-upstream
Conflicts:
	drivers/uwb/wlp/eda.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	414
1 file changed, 228 insertions(+), 186 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb5c3dc..fff1c4a20b65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -203,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static inline int rt_bandwidth_enabled(void)
@@ -261,6 +266,10 @@ struct task_group {
 	struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -286,6 +295,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  * Every UID task group (including init_task_group aka UID-0) will
@@ -345,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-	tg = p->user->tg;
+	rcu_read_lock();
+	tg = __task_cred(p)->user->tg;
+	rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 			  struct task_group, css);
@@ -586,6 +603,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int yld_exp_empty;
@@ -703,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
-{
-	char *buf;
-	int r = 0;
-	int len = 0;
 	int i;
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
-	}
-
-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
 	}
+	seq_puts(m, "\n");
 
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }
 
 static ssize_t
@@ -786,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
-	.read		= sched_feat_read,
-	.write		= sched_feat_write,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
 static __init int sched_init_debug(void)
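The two hunks above drop the hand-rolled read buffer for the sched_features debugfs file and reuse the seq_file helpers: the show callback prints into the seq_file, single_open() wires it to the open path, and seq_read/seq_lseek/single_release handle the rest. A minimal sketch of the same pattern for a hypothetical debugfs file (the example_* names are illustrative, not from the patch):

#include <linux/debugfs.h>
#include <linux/seq_file.h>

/* Illustrative show routine: everything printed here reaches userspace. */
static int example_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from a single_open() file\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, example_show, NULL);
}

static const struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* e.g. debugfs_create_file("example", 0444, NULL, NULL, &example_fops); */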
@@ -1139,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
@@ -1474,27 +1472,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
 	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
 	/*
 	 *		\Sum shares * rq_weight
@@ -1502,7 +1486,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *		    \Sum rq_weight
 	 *
 	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1495,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1509,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1545,9 +1535,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
 	for_each_cpu_mask(i, sd->span)
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
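Taken together, the hunks above move the "pretend an idle cpu carries one task of average load" fixup from update_group_shares_cpu() into tg_shares_up(), cache the result in cfs_rq->rq_weight, and drop the +1 from the divisor. With made-up numbers, the per-cpu share computation they feed works out as in the sketch below (none of these values come from the patch):

/* Worked example of the shares formula, with invented weights. */
static unsigned long example_shares(void)
{
	unsigned long sd_shares    = 1024;	/* group's shares in this domain */
	unsigned long rq_weight    = 2048;	/* this cpu's cached rq_weight */
	unsigned long sd_rq_weight = 3072;	/* \Sum rq_weight over the domain */

	/* same formula as update_group_shares_cpu(): 1024 * 2048 / 3072 == 682 */
	return (sd_shares * rq_weight) / sd_rq_weight;
}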
@@ -1612,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
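The double_lock_balance()/double_unlock_balance() pair is only moved up here (the matching removal appears further down in this diff), but its body is a clean illustration of the usual two-lock ordering rule: try the second lock, and if that fails while the second lock sorts lower by address, drop the held lock and retake both in address order. A userspace-flavoured sketch of the same rule with pthread mutexes (struct runqueue and lock_second_rq are made-up names):

#include <pthread.h>

struct runqueue {
	pthread_mutex_t lock;
};

/* Caller already holds this_rq->lock; returns 1 if it had to drop it. */
static int lock_second_rq(struct runqueue *this_rq, struct runqueue *busiest)
{
	int dropped = 0;

	if (pthread_mutex_trylock(&busiest->lock) != 0) {
		if (busiest < this_rq) {
			/* Wrong order: back off and reacquire in address order. */
			pthread_mutex_unlock(&this_rq->lock);
			pthread_mutex_lock(&busiest->lock);
			pthread_mutex_lock(&this_rq->lock);
			dropped = 1;
		} else {
			pthread_mutex_lock(&busiest->lock);
		}
	}
	return dropped;
}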
@@ -1845,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
+	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
 		p->se.wait_start -= clock_offset;
@@ -2254,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
@@ -2311,12 +2334,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.nr_wakeups_remote);
-	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	success = 1;
 
 out_running:
-	trace_sched_wakeup(rq, p);
+	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
@@ -2449,7 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	trace_sched_wakeup_new(rq, p);
+	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2812,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 }
 
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(this_rq->lock)
-	__acquires(busiest->lock)
-	__acquires(this_rq->lock)
-{
-	int ret = 0;
-
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
-	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
-			spin_unlock(&this_rq->lock);
-			spin_lock(&busiest->lock);
-			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-			ret = 1;
-		} else
-			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-	}
-	return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(busiest->lock)
-{
-	spin_unlock(&busiest->lock);
-	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2862,7 +2850,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
-	trace_sched_migrate_task(rq, p, dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -3707,7 +3694,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
 	struct sched_domain *sd;
-	int pulled_task = -1;
+	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 	cpumask_t tmpmask;
 
@@ -4203,7 +4190,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
 	if (p == rq->idle) {
 		p->stime = cputime_add(p->stime, steal);
-		account_group_system_time(p, steal);
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
@@ -4339,7 +4325,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
@@ -5134,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	match = (cred->euid == pcred->euid ||
+		 cred->euid == pcred->uid);
+	rcu_read_unlock();
+	return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
 				struct sched_param *param, bool user)
 {
@@ -5193,8 +5195,7 @@ recheck:
 			return -EPERM;
 
 		/* can't change other user's priorities */
-		if ((current->euid != p->euid) &&
-		    (current->euid != p->uid))
+		if (!check_same_owner(p))
 			return -EPERM;
 	}
 
@@ -5426,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	read_unlock(&tasklist_lock);
 
 	retval = -EPERM;
-	if ((current->euid != p->euid) && (current->euid != p->uid) &&
-	    !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
@@ -5896,6 +5896,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
+	ftrace_graph_init_task(idle);
 }
 
 /*
@@ -6126,7 +6127,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6587,7 +6587,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			req = list_entry(rq->migration_queue.next,
 					 struct migration_req, list);
 			list_del_init(&req->list);
+			spin_unlock_irq(&rq->lock);
 			complete(&req->done);
+			spin_lock_irq(&rq->lock);
 		}
 		spin_unlock_irq(&rq->lock);
 		break;
@@ -6636,28 +6638,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-			return "NONE";
-	case SD_LV_SIBLING:
-			return "SIBLING";
-	case SD_LV_MC:
-			return "MC";
-	case SD_LV_CPU:
-			return "CPU";
-	case SD_LV_NODE:
-			return "NODE";
-	case SD_LV_ALLNODES:
-			return "ALLNODES";
-	case SD_LV_MAX:
-			return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6677,8 +6657,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s level %s\n",
-		str, sd_level_to_string(sd->level));
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6814,6 +6793,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -7334,13 +7315,21 @@ struct allmasks {
 };
 
 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC		1
-#define SCHED_CPUMASK_FREE(v)		kfree(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC		0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define SCHED_CPUMASK_VAR(v, a)		cpumask_t *v = (cpumask_t *) \
@@ -7416,9 +7405,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		return -ENOMEM;
 	}
 
-#if SCHED_CPUMASK_ALLOC
 	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
 	if (!allmasks) {
 		printk(KERN_WARNING "Cannot alloc cpumask array\n");
 		kfree(rd);
@@ -7427,7 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 		return -ENOMEM;
 	}
-#endif
+
 	tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7681,13 +7669,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpu_attach_domain(sd, rd, i);
 	}
 
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	return 0;
 
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	kfree(rd);
 	return -ENOMEM;
 #endif
@@ -7710,8 +7698,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }
 
 /*
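arch_update_cpu_topology() becomes a weak default that reports "no change" by returning 0; any architecture that supplies a normal (strong) definition overrides it at link time and can return 1 to force the scheduler domains to be rebuilt. A hedged sketch of what such an override could look like (the body here is hypothetical):

/* Hypothetical strong override in an architecture's topology code: the
 * linker picks this over the weak default defined in kernel/sched.c. */
int arch_update_cpu_topology(void)
{
	/* ... refresh the core/package maps from firmware or a hypervisor ... */
	return 1;	/* 1 = topology changed, 0 = unchanged */
}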
@@ -7751,8 +7745,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	unregister_sched_domain_sysctl();
-
 	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
@@ -7805,17 +7797,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
+	int new_topology;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
 	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7830,12 +7826,12 @@ match1:
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur; j++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
 			if (cpus_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
@@ -8490,7 +8486,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
 	struct rq *rq;
 	int i;
 
@@ -8506,18 +8502,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				      GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity),
-				  GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
 			goto err;
 
-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
 	}
 
 	return 1;
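The kmalloc_node(..., GFP_KERNEL|__GFP_ZERO, ...) to kzalloc_node(..., GFP_KERNEL, ...) switch in this and the following rt hunk is purely cosmetic: kzalloc_node() is the zeroing wrapper around kmalloc_node(), so both spellings below should yield the same zeroed, node-local allocation (a sketch, not code from the patch):

#include <linux/slab.h>

static void *zeroed_node_alloc(size_t size, int node)
{
	void *a = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, node);	/* old spelling */
	void *b = kzalloc_node(size, GFP_KERNEL, node);			/* new spelling */

	kfree(a);	/* keep only one of the two equivalent buffers */
	return b;
}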
@@ -8578,7 +8573,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	int i;
 
@@ -8595,18 +8590,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_rq)
 			goto err;
 
-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_se)
 			goto err;
 
-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
 	}
 
 	return 1;
@@ -9249,11 +9243,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9287,6 +9282,9 @@ static struct cgroup_subsys_state *cpuacct_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
 	return &ca->css;
 }
 
@@ -9300,6 +9298,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
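The helpers added above take rq->lock around the u64 access only on !CONFIG_64BIT builds, because a 32-bit CPU cannot load or store a 64-bit counter in one instruction and could otherwise observe a half-updated value (a torn read). A compacted sketch of that guard, using a hypothetical counter and lock rather than the cpuacct fields:

#include <linux/spinlock.h>
#include <linux/types.h>

/* Read a 64-bit counter that 32-bit CPUs cannot access atomically. */
static u64 read_u64_counter(u64 *counter, spinlock_t *lock)
{
	u64 val;

#ifndef CONFIG_64BIT
	spin_lock_irq(lock);		/* serialise against the writer */
	val = *counter;
	spin_unlock_irq(lock);
#else
	val = *counter;			/* an aligned 64-bit load is atomic here */
#endif
	return val;
}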
@@ -9307,17 +9340,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 	u64 totalcpuusage = 0;
 	int i;
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		/*
-		 * Take rq->lock to make 64-bit addition safe on 32-bit
-		 * platforms.
-		 */
-		spin_lock_irq(&cpu_rq(i)->lock);
-		totalcpuusage += *cpuusage;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
 	return totalcpuusage;
 }
@@ -9334,23 +9358,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
 		goto out;
 	}
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);
 
-		spin_lock_irq(&cpu_rq(i)->lock);
-		*cpuusage = 0;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 out:
 	return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
 		.read_u64 = cpuusage_read,
 		.write_u64 = cpuusage_write,
 	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9366,14 +9406,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
+	int cpu;
 
 	if (!cpuacct_subsys.active)
 		return;
 
+	cpu = task_cpu(tsk);
 	ca = task_ca(tsk);
-	if (ca) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+	for (; ca; ca = ca->parent) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
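The last hunk makes cpuacct charging hierarchical: instead of touching only the task's own group, the loop follows ca->parent so every ancestor accumulates the same cputime. The parent-walk pattern it relies on, reduced to a standalone sketch (struct group and the function name are invented):

struct group {
	struct group *parent;		/* NULL at the root */
	unsigned long long usage;
};

/* Mirrors the for (; ca; ca = ca->parent) loop in cpuacct_charge(). */
static void charge_group_and_ancestors(struct group *g, unsigned long long delta)
{
	for (; g; g = g->parent)
		g->usage += delta;
}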