path: root/kernel/sched.c
author	Ingo Molnar <mingo@elte.hu>	2008-12-29 03:42:58 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-12-29 04:37:07 -0500
commit	0ce74d9296c971b2355c26984ad0bc538e34dd6c (patch)
tree	566d03e2a4f6b42dab9628cd82c93cd61d587467 /kernel/sched.c
parent	1cc4fff0b360aeffeedb7d6db5089d88dd861700 (diff)
parent	3c92ec8ae91ecf59d88c798301833d7cf83f2179 (diff)
Merge branch 'linus' into timers/hrtimers
Conflicts:
	sound/drivers/pcsp/pcsp.c

Semantic conflict:
	sound/core/hrtimer.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	407
1 files changed, 225 insertions, 182 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 22c532a6f82c..355eda28720b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -260,6 +266,10 @@ struct task_group {
 	struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -285,6 +295,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *	Every UID task group (including init_task_group aka UID-0) will
@@ -344,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-	tg = p->user->tg;
+	rcu_read_lock();
+	tg = __task_cred(p)->user->tg;
+	rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 			  struct task_group, css);
@@ -585,6 +603,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int yld_exp_empty;
@@ -702,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	char *buf;
-	int r = 0;
-	int len = 0;
 	int i;
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
-	}
-
-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
 	}
+	seq_puts(m, "\n");
 
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }
 
 static ssize_t
@@ -785,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
-	.read		= sched_feat_read,
-	.write		= sched_feat_write,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1472,27 +1472,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
 	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
 	/*
 	 *           \Sum shares * rq_weight
@@ -1500,7 +1486,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1509,11 +1495,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1527,13 +1509,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1543,9 +1535,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
 	for_each_cpu_mask(i, sd->span)
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1610,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1843,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
+	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
 		p->se.wait_start -= clock_offset;
@@ -2252,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
@@ -2309,12 +2334,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.nr_wakeups_remote);
-	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	success = 1;
 
 out_running:
-	trace_sched_wakeup(rq, p);
+	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
@@ -2447,7 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	trace_sched_wakeup_new(rq, p);
+	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2810,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 }
 
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(this_rq->lock)
-	__acquires(busiest->lock)
-	__acquires(this_rq->lock)
-{
-	int ret = 0;
-
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
-	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
-			spin_unlock(&this_rq->lock);
-			spin_lock(&busiest->lock);
-			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-			ret = 1;
-		} else
-			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-	}
-	return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(busiest->lock)
-{
-	spin_unlock(&busiest->lock);
-	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2860,7 +2850,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
-	trace_sched_migrate_task(rq, p, dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -3705,7 +3694,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
 	struct sched_domain *sd;
-	int pulled_task = -1;
+	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 	cpumask_t tmpmask;
 
@@ -5132,6 +5121,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	match = (cred->euid == pcred->euid ||
+		 cred->euid == pcred->uid);
+	rcu_read_unlock();
+	return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
 				struct sched_param *param, bool user)
 {
@@ -5191,8 +5196,7 @@ recheck:
 			return -EPERM;
 
 		/* can't change other user's priorities */
-		if ((current->euid != p->euid) &&
-		    (current->euid != p->uid))
+		if (!check_same_owner(p))
 			return -EPERM;
 	}
 
@@ -5424,8 +5428,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	read_unlock(&tasklist_lock);
 
 	retval = -EPERM;
-	if ((current->euid != p->euid) && (current->euid != p->uid) &&
-			!capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
@@ -5894,6 +5897,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
+	ftrace_graph_init_task(idle);
 }
 
 /*
@@ -6124,7 +6128,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6636,28 +6639,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-			return "NONE";
-	case SD_LV_SIBLING:
-			return "SIBLING";
-	case SD_LV_MC:
-			return "MC";
-	case SD_LV_CPU:
-			return "CPU";
-	case SD_LV_NODE:
-			return "NODE";
-	case SD_LV_ALLNODES:
-			return "ALLNODES";
-	case SD_LV_MAX:
-			return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6677,8 +6658,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s level %s\n",
-		str, sd_level_to_string(sd->level));
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6814,6 +6794,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -7334,13 +7316,21 @@ struct allmasks {
 };
 
 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC		1
-#define SCHED_CPUMASK_FREE(v)		kfree(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC		0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define SCHED_CPUMASK_VAR(v, a)	cpumask_t *v = (cpumask_t *) \
@@ -7416,9 +7406,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		return -ENOMEM;
 	}
 
-#if SCHED_CPUMASK_ALLOC
 	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
 	if (!allmasks) {
 		printk(KERN_WARNING "Cannot alloc cpumask array\n");
 		kfree(rd);
@@ -7427,7 +7416,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 		return -ENOMEM;
 	}
-#endif
+
 	tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7681,13 +7670,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpu_attach_domain(sd, rd, i);
 	}
 
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	return 0;
 
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	kfree(rd);
 	return -ENOMEM;
 #endif
@@ -7710,8 +7699,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }
 
 /*
@@ -7751,8 +7746,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	unregister_sched_domain_sysctl();
-
 	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
@@ -7805,17 +7798,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
+	int new_topology;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
 	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7830,12 +7827,12 @@ match1:
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur; j++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
 			if (cpus_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
@@ -8490,7 +8487,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
 	struct rq *rq;
 	int i;
 
@@ -8506,18 +8503,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
 			goto err;
 
-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
 	}
 
 	return 1;
@@ -8578,7 +8574,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	int i;
 
@@ -8595,18 +8591,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_rq)
 			goto err;
 
-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_se)
 			goto err;
 
-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
 	}
 
 	return 1;
@@ -9249,11 +9244,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9287,6 +9283,9 @@ static struct cgroup_subsys_state *cpuacct_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
 	return &ca->css;
 }
 
@@ -9300,6 +9299,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9307,17 +9341,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 	u64 totalcpuusage = 0;
 	int i;
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		/*
-		 * Take rq->lock to make 64-bit addition safe on 32-bit
-		 * platforms.
-		 */
-		spin_lock_irq(&cpu_rq(i)->lock);
-		totalcpuusage += *cpuusage;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
 	return totalcpuusage;
 }
@@ -9334,23 +9359,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
 		goto out;
 	}
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);
 
-		spin_lock_irq(&cpu_rq(i)->lock);
-		*cpuusage = 0;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 out:
 	return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
 		.read_u64 = cpuusage_read,
 		.write_u64 = cpuusage_write,
 	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9366,14 +9407,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
+	int cpu;
 
 	if (!cpuacct_subsys.active)
 		return;
 
+	cpu = task_cpu(tsk);
 	ca = task_ca(tsk);
-	if (ca) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+	for (; ca; ca = ca->parent) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }