aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2008-12-29 16:32:35 -0500
committerRusty Russell <rusty@rustcorp.com.au>2008-12-29 16:32:35 -0500
commit33edcf133ba93ecba2e4b6472e97b689895d805c (patch)
tree327d7a20acef64005e7c5ccbfa1265be28aeb6ac /kernel/sched.c
parentbe4d638c1597580ed2294d899d9f1a2cd10e462c (diff)
parent3c92ec8ae91ecf59d88c798301833d7cf83f2179 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c407
1 files changed, 225 insertions, 182 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 42929239830f..bdd180a0c6be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
118 */ 118 */
119#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
120 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
121#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
122/* 128/*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -261,6 +267,10 @@ struct task_group {
261 struct cgroup_subsys_state css; 267 struct cgroup_subsys_state css;
262#endif 268#endif
263 269
270#ifdef CONFIG_USER_SCHED
271 uid_t uid;
272#endif
273
264#ifdef CONFIG_FAIR_GROUP_SCHED 274#ifdef CONFIG_FAIR_GROUP_SCHED
265 /* schedulable entities of this group on each cpu */ 275 /* schedulable entities of this group on each cpu */
266 struct sched_entity **se; 276 struct sched_entity **se;
@@ -286,6 +296,12 @@ struct task_group {
286 296
287#ifdef CONFIG_USER_SCHED 297#ifdef CONFIG_USER_SCHED
288 298
299/* Helper function to pass uid information to create_sched_user() */
300void set_tg_uid(struct user_struct *user)
301{
302 user->tg->uid = user->uid;
303}
304
289/* 305/*
290 * Root task group. 306 * Root task group.
291 * Every UID task group (including init_task_group aka UID-0) will 307 * Every UID task group (including init_task_group aka UID-0) will
@@ -345,7 +361,9 @@ static inline struct task_group *task_group(struct task_struct *p)
345 struct task_group *tg; 361 struct task_group *tg;
346 362
347#ifdef CONFIG_USER_SCHED 363#ifdef CONFIG_USER_SCHED
348 tg = p->user->tg; 364 rcu_read_lock();
365 tg = __task_cred(p)->user->tg;
366 rcu_read_unlock();
349#elif defined(CONFIG_CGROUP_SCHED) 367#elif defined(CONFIG_CGROUP_SCHED)
350 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 368 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
351 struct task_group, css); 369 struct task_group, css);
@@ -586,6 +604,8 @@ struct rq {
586#ifdef CONFIG_SCHEDSTATS 604#ifdef CONFIG_SCHEDSTATS
587 /* latency stats */ 605 /* latency stats */
588 struct sched_info rq_sched_info; 606 struct sched_info rq_sched_info;
607 unsigned long long rq_cpu_time;
608 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
589 609
590 /* sys_sched_yield() stats */ 610 /* sys_sched_yield() stats */
591 unsigned int yld_exp_empty; 611 unsigned int yld_exp_empty;
@@ -703,45 +723,18 @@ static __read_mostly char *sched_feat_names[] = {
703 723
704#undef SCHED_FEAT 724#undef SCHED_FEAT
705 725
706static int sched_feat_open(struct inode *inode, struct file *filp) 726static int sched_feat_show(struct seq_file *m, void *v)
707{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{ 727{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 728 int i;
720 729
721 for (i = 0; sched_feat_names[i]; i++) { 730 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 731 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 732 seq_puts(m, "NO_");
724 } 733 seq_printf(m, "%s ", sched_feat_names[i]);
725
726 buf = kmalloc(len + 2, GFP_KERNEL);
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 } 734 }
735 seq_puts(m, "\n");
736 736
737 r += sprintf(buf + r, "\n"); 737 return 0;
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 738}
746 739
747static ssize_t 740static ssize_t
@@ -786,10 +779,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 779 return cnt;
787} 780}
788 781
782static int sched_feat_open(struct inode *inode, struct file *filp)
783{
784 return single_open(filp, sched_feat_show, NULL);
785}
786
789static struct file_operations sched_feat_fops = { 787static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 788 .open = sched_feat_open,
791 .read = sched_feat_read, 789 .write = sched_feat_write,
792 .write = sched_feat_write, 790 .read = seq_read,
791 .llseek = seq_lseek,
792 .release = single_release,
793}; 793};
794 794
795static __init int sched_init_debug(void) 795static __init int sched_init_debug(void)
@@ -1474,27 +1474,13 @@ static void
1474update_group_shares_cpu(struct task_group *tg, int cpu, 1474update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight) 1475 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{ 1476{
1477 int boost = 0;
1478 unsigned long shares; 1477 unsigned long shares;
1479 unsigned long rq_weight; 1478 unsigned long rq_weight;
1480 1479
1481 if (!tg->se[cpu]) 1480 if (!tg->se[cpu])
1482 return; 1481 return;
1483 1482
1484 rq_weight = tg->cfs_rq[cpu]->load.weight; 1483 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1485
1486 /*
1487 * If there are currently no tasks on the cpu pretend there is one of
1488 * average load so that when a new task gets to run here it will not
1489 * get delayed by group starvation.
1490 */
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498 1484
1499 /* 1485 /*
1500 * \Sum shares * rq_weight 1486 * \Sum shares * rq_weight
@@ -1502,7 +1488,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1502 * \Sum rq_weight 1488 * \Sum rq_weight
1503 * 1489 *
1504 */ 1490 */
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1491 shares = (sd_shares * rq_weight) / sd_rq_weight;
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1492 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507 1493
1508 if (abs(shares - tg->se[cpu]->load.weight) > 1494 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1497,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1511 unsigned long flags; 1497 unsigned long flags;
1512 1498
1513 spin_lock_irqsave(&rq->lock, flags); 1499 spin_lock_irqsave(&rq->lock, flags);
1514 /* 1500 tg->cfs_rq[cpu]->shares = shares;
1515 * record the actual number of shares, not the boosted amount.
1516 */
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519 1501
1520 __set_se_shares(tg->se[cpu], shares); 1502 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags); 1503 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1511,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1529 */ 1511 */
1530static int tg_shares_up(struct task_group *tg, void *data) 1512static int tg_shares_up(struct task_group *tg, void *data)
1531{ 1513{
1532 unsigned long rq_weight = 0; 1514 unsigned long weight, rq_weight = 0;
1533 unsigned long shares = 0; 1515 unsigned long shares = 0;
1534 struct sched_domain *sd = data; 1516 struct sched_domain *sd = data;
1535 int i; 1517 int i;
1536 1518
1537 for_each_cpu_mask(i, sd->span) { 1519 for_each_cpu_mask(i, sd->span) {
1538 rq_weight += tg->cfs_rq[i]->load.weight; 1520 /*
1521 * If there are currently no tasks on the cpu pretend there
1522 * is one of average load so that when a new task gets to
1523 * run here it will not get delayed by group starvation.
1524 */
1525 weight = tg->cfs_rq[i]->load.weight;
1526 if (!weight)
1527 weight = NICE_0_LOAD;
1528
1529 tg->cfs_rq[i]->rq_weight = weight;
1530 rq_weight += weight;
1539 shares += tg->cfs_rq[i]->shares; 1531 shares += tg->cfs_rq[i]->shares;
1540 } 1532 }
1541 1533
@@ -1545,9 +1537,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1537 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares; 1538 shares = tg->shares;
1547 1539
1548 if (!rq_weight)
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span) 1540 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight); 1541 update_group_shares_cpu(tg, i, shares, rq_weight);
1553 1542
@@ -1612,6 +1601,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1612 1601
1613#endif 1602#endif
1614 1603
1604/*
1605 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1606 */
1607static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1608 __releases(this_rq->lock)
1609 __acquires(busiest->lock)
1610 __acquires(this_rq->lock)
1611{
1612 int ret = 0;
1613
1614 if (unlikely(!irqs_disabled())) {
1615 /* printk() doesn't work good under rq->lock */
1616 spin_unlock(&this_rq->lock);
1617 BUG_ON(1);
1618 }
1619 if (unlikely(!spin_trylock(&busiest->lock))) {
1620 if (busiest < this_rq) {
1621 spin_unlock(&this_rq->lock);
1622 spin_lock(&busiest->lock);
1623 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1624 ret = 1;
1625 } else
1626 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1627 }
1628 return ret;
1629}
1630
1631static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1632 __releases(busiest->lock)
1633{
1634 spin_unlock(&busiest->lock);
1635 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1636}
1615#endif 1637#endif
1616 1638
1617#ifdef CONFIG_FAIR_GROUP_SCHED 1639#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1845,6 +1867,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1845 1867
1846 clock_offset = old_rq->clock - new_rq->clock; 1868 clock_offset = old_rq->clock - new_rq->clock;
1847 1869
1870 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1871
1848#ifdef CONFIG_SCHEDSTATS 1872#ifdef CONFIG_SCHEDSTATS
1849 if (p->se.wait_start) 1873 if (p->se.wait_start)
1850 p->se.wait_start -= clock_offset; 1874 p->se.wait_start -= clock_offset;
@@ -2254,6 +2278,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2254 2278
2255 smp_wmb(); 2279 smp_wmb();
2256 rq = task_rq_lock(p, &flags); 2280 rq = task_rq_lock(p, &flags);
2281 update_rq_clock(rq);
2257 old_state = p->state; 2282 old_state = p->state;
2258 if (!(old_state & state)) 2283 if (!(old_state & state))
2259 goto out; 2284 goto out;
@@ -2311,12 +2336,11 @@ out_activate:
2311 schedstat_inc(p, se.nr_wakeups_local); 2336 schedstat_inc(p, se.nr_wakeups_local);
2312 else 2337 else
2313 schedstat_inc(p, se.nr_wakeups_remote); 2338 schedstat_inc(p, se.nr_wakeups_remote);
2314 update_rq_clock(rq);
2315 activate_task(rq, p, 1); 2339 activate_task(rq, p, 1);
2316 success = 1; 2340 success = 1;
2317 2341
2318out_running: 2342out_running:
2319 trace_sched_wakeup(rq, p); 2343 trace_sched_wakeup(rq, p, success);
2320 check_preempt_curr(rq, p, sync); 2344 check_preempt_curr(rq, p, sync);
2321 2345
2322 p->state = TASK_RUNNING; 2346 p->state = TASK_RUNNING;
@@ -2449,7 +2473,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2449 p->sched_class->task_new(rq, p); 2473 p->sched_class->task_new(rq, p);
2450 inc_nr_running(rq); 2474 inc_nr_running(rq);
2451 } 2475 }
2452 trace_sched_wakeup_new(rq, p); 2476 trace_sched_wakeup_new(rq, p, 1);
2453 check_preempt_curr(rq, p, 0); 2477 check_preempt_curr(rq, p, 0);
2454#ifdef CONFIG_SMP 2478#ifdef CONFIG_SMP
2455 if (p->sched_class->task_wake_up) 2479 if (p->sched_class->task_wake_up)
@@ -2812,40 +2836,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2812} 2836}
2813 2837
2814/* 2838/*
2815 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2816 */
2817static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2818 __releases(this_rq->lock)
2819 __acquires(busiest->lock)
2820 __acquires(this_rq->lock)
2821{
2822 int ret = 0;
2823
2824 if (unlikely(!irqs_disabled())) {
2825 /* printk() doesn't work good under rq->lock */
2826 spin_unlock(&this_rq->lock);
2827 BUG_ON(1);
2828 }
2829 if (unlikely(!spin_trylock(&busiest->lock))) {
2830 if (busiest < this_rq) {
2831 spin_unlock(&this_rq->lock);
2832 spin_lock(&busiest->lock);
2833 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2834 ret = 1;
2835 } else
2836 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2837 }
2838 return ret;
2839}
2840
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock)
2843{
2844 spin_unlock(&busiest->lock);
2845 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2846}
2847
2848/*
2849 * If dest_cpu is allowed for this process, migrate the task to it. 2839 * If dest_cpu is allowed for this process, migrate the task to it.
2850 * This is accomplished by forcing the cpu_allowed mask to only 2840 * This is accomplished by forcing the cpu_allowed mask to only
2851 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2841 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2862,7 +2852,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2862 || unlikely(!cpu_active(dest_cpu))) 2852 || unlikely(!cpu_active(dest_cpu)))
2863 goto out; 2853 goto out;
2864 2854
2865 trace_sched_migrate_task(rq, p, dest_cpu);
2866 /* force the process onto the specified CPU */ 2855 /* force the process onto the specified CPU */
2867 if (migrate_task(p, dest_cpu, &req)) { 2856 if (migrate_task(p, dest_cpu, &req)) {
2868 /* Need to wait for migration thread (might exit: take ref). */ 2857 /* Need to wait for migration thread (might exit: take ref). */
@@ -3707,7 +3696,7 @@ out_balanced:
3707static void idle_balance(int this_cpu, struct rq *this_rq) 3696static void idle_balance(int this_cpu, struct rq *this_rq)
3708{ 3697{
3709 struct sched_domain *sd; 3698 struct sched_domain *sd;
3710 int pulled_task = -1; 3699 int pulled_task = 0;
3711 unsigned long next_balance = jiffies + HZ; 3700 unsigned long next_balance = jiffies + HZ;
3712 cpumask_t tmpmask; 3701 cpumask_t tmpmask;
3713 3702
@@ -5134,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5134 set_load_weight(p); 5123 set_load_weight(p);
5135} 5124}
5136 5125
5126/*
5127 * check the target process has a UID that matches the current process's
5128 */
5129static bool check_same_owner(struct task_struct *p)
5130{
5131 const struct cred *cred = current_cred(), *pcred;
5132 bool match;
5133
5134 rcu_read_lock();
5135 pcred = __task_cred(p);
5136 match = (cred->euid == pcred->euid ||
5137 cred->euid == pcred->uid);
5138 rcu_read_unlock();
5139 return match;
5140}
5141
5137static int __sched_setscheduler(struct task_struct *p, int policy, 5142static int __sched_setscheduler(struct task_struct *p, int policy,
5138 struct sched_param *param, bool user) 5143 struct sched_param *param, bool user)
5139{ 5144{
@@ -5193,8 +5198,7 @@ recheck:
5193 return -EPERM; 5198 return -EPERM;
5194 5199
5195 /* can't change other user's priorities */ 5200 /* can't change other user's priorities */
5196 if ((current->euid != p->euid) && 5201 if (!check_same_owner(p))
5197 (current->euid != p->uid))
5198 return -EPERM; 5202 return -EPERM;
5199 } 5203 }
5200 5204
@@ -5426,8 +5430,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5426 read_unlock(&tasklist_lock); 5430 read_unlock(&tasklist_lock);
5427 5431
5428 retval = -EPERM; 5432 retval = -EPERM;
5429 if ((current->euid != p->euid) && (current->euid != p->uid) && 5433 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5430 !capable(CAP_SYS_NICE))
5431 goto out_unlock; 5434 goto out_unlock;
5432 5435
5433 retval = security_task_setscheduler(p, 0, NULL); 5436 retval = security_task_setscheduler(p, 0, NULL);
@@ -5896,6 +5899,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5896 * The idle tasks have their own, simple scheduling class: 5899 * The idle tasks have their own, simple scheduling class:
5897 */ 5900 */
5898 idle->sched_class = &idle_sched_class; 5901 idle->sched_class = &idle_sched_class;
5902 ftrace_graph_init_task(idle);
5899} 5903}
5900 5904
5901/* 5905/*
@@ -6126,7 +6130,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6126 6130
6127/* 6131/*
6128 * Figure out where task on dead CPU should go, use force if necessary. 6132 * Figure out where task on dead CPU should go, use force if necessary.
6129 * NOTE: interrupts should be disabled by the caller
6130 */ 6133 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6134static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6135{
@@ -6638,28 +6641,6 @@ early_initcall(migration_init);
6638 6641
6639#ifdef CONFIG_SCHED_DEBUG 6642#ifdef CONFIG_SCHED_DEBUG
6640 6643
6641static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6642{
6643 switch (lvl) {
6644 case SD_LV_NONE:
6645 return "NONE";
6646 case SD_LV_SIBLING:
6647 return "SIBLING";
6648 case SD_LV_MC:
6649 return "MC";
6650 case SD_LV_CPU:
6651 return "CPU";
6652 case SD_LV_NODE:
6653 return "NODE";
6654 case SD_LV_ALLNODES:
6655 return "ALLNODES";
6656 case SD_LV_MAX:
6657 return "MAX";
6658
6659 }
6660 return "MAX";
6661}
6662
6663static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6644static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6664 cpumask_t *groupmask) 6645 cpumask_t *groupmask)
6665{ 6646{
@@ -6679,8 +6660,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6679 return -1; 6660 return -1;
6680 } 6661 }
6681 6662
6682 printk(KERN_CONT "span %s level %s\n", 6663 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6683 str, sd_level_to_string(sd->level));
6684 6664
6685 if (!cpu_isset(cpu, sd->span)) { 6665 if (!cpu_isset(cpu, sd->span)) {
6686 printk(KERN_ERR "ERROR: domain->span does not contain " 6666 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6816,6 +6796,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6816 SD_BALANCE_EXEC | 6796 SD_BALANCE_EXEC |
6817 SD_SHARE_CPUPOWER | 6797 SD_SHARE_CPUPOWER |
6818 SD_SHARE_PKG_RESOURCES); 6798 SD_SHARE_PKG_RESOURCES);
6799 if (nr_node_ids == 1)
6800 pflags &= ~SD_SERIALIZE;
6819 } 6801 }
6820 if (~cflags & pflags) 6802 if (~cflags & pflags)
6821 return 0; 6803 return 0;
@@ -7336,13 +7318,21 @@ struct allmasks {
7336}; 7318};
7337 7319
7338#if NR_CPUS > 128 7320#if NR_CPUS > 128
7339#define SCHED_CPUMASK_ALLOC 1 7321#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7340#define SCHED_CPUMASK_FREE(v) kfree(v) 7322static inline void sched_cpumask_alloc(struct allmasks **masks)
7341#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v 7323{
7324 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7325}
7326static inline void sched_cpumask_free(struct allmasks *masks)
7327{
7328 kfree(masks);
7329}
7342#else 7330#else
7343#define SCHED_CPUMASK_ALLOC 0 7331#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7344#define SCHED_CPUMASK_FREE(v) 7332static inline void sched_cpumask_alloc(struct allmasks **masks)
7345#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v 7333{ }
7334static inline void sched_cpumask_free(struct allmasks *masks)
7335{ }
7346#endif 7336#endif
7347 7337
7348#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ 7338#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
@@ -7418,9 +7408,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7418 return -ENOMEM; 7408 return -ENOMEM;
7419 } 7409 }
7420 7410
7421#if SCHED_CPUMASK_ALLOC
7422 /* get space for all scratch cpumask variables */ 7411 /* get space for all scratch cpumask variables */
7423 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); 7412 sched_cpumask_alloc(&allmasks);
7424 if (!allmasks) { 7413 if (!allmasks) {
7425 printk(KERN_WARNING "Cannot alloc cpumask array\n"); 7414 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7426 kfree(rd); 7415 kfree(rd);
@@ -7429,7 +7418,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7429#endif 7418#endif
7430 return -ENOMEM; 7419 return -ENOMEM;
7431 } 7420 }
7432#endif 7421
7433 tmpmask = (cpumask_t *)allmasks; 7422 tmpmask = (cpumask_t *)allmasks;
7434 7423
7435 7424
@@ -7683,13 +7672,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7683 cpu_attach_domain(sd, rd, i); 7672 cpu_attach_domain(sd, rd, i);
7684 } 7673 }
7685 7674
7686 SCHED_CPUMASK_FREE((void *)allmasks); 7675 sched_cpumask_free(allmasks);
7687 return 0; 7676 return 0;
7688 7677
7689#ifdef CONFIG_NUMA 7678#ifdef CONFIG_NUMA
7690error: 7679error:
7691 free_sched_groups(cpu_map, tmpmask); 7680 free_sched_groups(cpu_map, tmpmask);
7692 SCHED_CPUMASK_FREE((void *)allmasks); 7681 sched_cpumask_free(allmasks);
7693 kfree(rd); 7682 kfree(rd);
7694 return -ENOMEM; 7683 return -ENOMEM;
7695#endif 7684#endif
@@ -7712,8 +7701,14 @@ static struct sched_domain_attr *dattr_cur;
7712 */ 7701 */
7713static cpumask_t fallback_doms; 7702static cpumask_t fallback_doms;
7714 7703
7715void __attribute__((weak)) arch_update_cpu_topology(void) 7704/*
7705 * arch_update_cpu_topology lets virtualized architectures update the
7706 * cpu core maps. It is supposed to return 1 if the topology changed
7707 * or 0 if it stayed the same.
7708 */
7709int __attribute__((weak)) arch_update_cpu_topology(void)
7716{ 7710{
7711 return 0;
7717} 7712}
7718 7713
7719/* 7714/*
@@ -7753,8 +7748,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7753 cpumask_t tmpmask; 7748 cpumask_t tmpmask;
7754 int i; 7749 int i;
7755 7750
7756 unregister_sched_domain_sysctl();
7757
7758 for_each_cpu_mask_nr(i, *cpu_map) 7751 for_each_cpu_mask_nr(i, *cpu_map)
7759 cpu_attach_domain(NULL, &def_root_domain, i); 7752 cpu_attach_domain(NULL, &def_root_domain, i);
7760 synchronize_sched(); 7753 synchronize_sched();
@@ -7807,17 +7800,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7807 struct sched_domain_attr *dattr_new) 7800 struct sched_domain_attr *dattr_new)
7808{ 7801{
7809 int i, j, n; 7802 int i, j, n;
7803 int new_topology;
7810 7804
7811 mutex_lock(&sched_domains_mutex); 7805 mutex_lock(&sched_domains_mutex);
7812 7806
7813 /* always unregister in case we don't destroy any domains */ 7807 /* always unregister in case we don't destroy any domains */
7814 unregister_sched_domain_sysctl(); 7808 unregister_sched_domain_sysctl();
7815 7809
7810 /* Let architecture update cpu core mappings. */
7811 new_topology = arch_update_cpu_topology();
7812
7816 n = doms_new ? ndoms_new : 0; 7813 n = doms_new ? ndoms_new : 0;
7817 7814
7818 /* Destroy deleted domains */ 7815 /* Destroy deleted domains */
7819 for (i = 0; i < ndoms_cur; i++) { 7816 for (i = 0; i < ndoms_cur; i++) {
7820 for (j = 0; j < n; j++) { 7817 for (j = 0; j < n && !new_topology; j++) {
7821 if (cpus_equal(doms_cur[i], doms_new[j]) 7818 if (cpus_equal(doms_cur[i], doms_new[j])
7822 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7819 && dattrs_equal(dattr_cur, i, dattr_new, j))
7823 goto match1; 7820 goto match1;
@@ -7832,12 +7829,12 @@ match1:
7832 ndoms_cur = 0; 7829 ndoms_cur = 0;
7833 doms_new = &fallback_doms; 7830 doms_new = &fallback_doms;
7834 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7831 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7835 dattr_new = NULL; 7832 WARN_ON_ONCE(dattr_new);
7836 } 7833 }
7837 7834
7838 /* Build new domains */ 7835 /* Build new domains */
7839 for (i = 0; i < ndoms_new; i++) { 7836 for (i = 0; i < ndoms_new; i++) {
7840 for (j = 0; j < ndoms_cur; j++) { 7837 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7841 if (cpus_equal(doms_new[i], doms_cur[j]) 7838 if (cpus_equal(doms_new[i], doms_cur[j])
7842 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7839 && dattrs_equal(dattr_new, i, dattr_cur, j))
7843 goto match2; 7840 goto match2;
@@ -8492,7 +8489,7 @@ static
8492int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8489int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8493{ 8490{
8494 struct cfs_rq *cfs_rq; 8491 struct cfs_rq *cfs_rq;
8495 struct sched_entity *se, *parent_se; 8492 struct sched_entity *se;
8496 struct rq *rq; 8493 struct rq *rq;
8497 int i; 8494 int i;
8498 8495
@@ -8508,18 +8505,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8508 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8509 rq = cpu_rq(i); 8506 rq = cpu_rq(i);
8510 8507
8511 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8508 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8512 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8509 GFP_KERNEL, cpu_to_node(i));
8513 if (!cfs_rq) 8510 if (!cfs_rq)
8514 goto err; 8511 goto err;
8515 8512
8516 se = kmalloc_node(sizeof(struct sched_entity), 8513 se = kzalloc_node(sizeof(struct sched_entity),
8517 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8514 GFP_KERNEL, cpu_to_node(i));
8518 if (!se) 8515 if (!se)
8519 goto err; 8516 goto err;
8520 8517
8521 parent_se = parent ? parent->se[i] : NULL; 8518 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8522 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8523 } 8519 }
8524 8520
8525 return 1; 8521 return 1;
@@ -8580,7 +8576,7 @@ static
8580int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8576int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8581{ 8577{
8582 struct rt_rq *rt_rq; 8578 struct rt_rq *rt_rq;
8583 struct sched_rt_entity *rt_se, *parent_se; 8579 struct sched_rt_entity *rt_se;
8584 struct rq *rq; 8580 struct rq *rq;
8585 int i; 8581 int i;
8586 8582
@@ -8597,18 +8593,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8597 for_each_possible_cpu(i) { 8593 for_each_possible_cpu(i) {
8598 rq = cpu_rq(i); 8594 rq = cpu_rq(i);
8599 8595
8600 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8596 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8597 GFP_KERNEL, cpu_to_node(i));
8602 if (!rt_rq) 8598 if (!rt_rq)
8603 goto err; 8599 goto err;
8604 8600
8605 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8601 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8602 GFP_KERNEL, cpu_to_node(i));
8607 if (!rt_se) 8603 if (!rt_se)
8608 goto err; 8604 goto err;
8609 8605
8610 parent_se = parent ? parent->rt_se[i] : NULL; 8606 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8611 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8612 } 8607 }
8613 8608
8614 return 1; 8609 return 1;
@@ -9251,11 +9246,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9251 * (balbir@in.ibm.com). 9246 * (balbir@in.ibm.com).
9252 */ 9247 */
9253 9248
9254/* track cpu usage of a group of tasks */ 9249/* track cpu usage of a group of tasks and its child groups */
9255struct cpuacct { 9250struct cpuacct {
9256 struct cgroup_subsys_state css; 9251 struct cgroup_subsys_state css;
9257 /* cpuusage holds pointer to a u64-type object on every cpu */ 9252 /* cpuusage holds pointer to a u64-type object on every cpu */
9258 u64 *cpuusage; 9253 u64 *cpuusage;
9254 struct cpuacct *parent;
9259}; 9255};
9260 9256
9261struct cgroup_subsys cpuacct_subsys; 9257struct cgroup_subsys cpuacct_subsys;
@@ -9289,6 +9285,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9289 return ERR_PTR(-ENOMEM); 9285 return ERR_PTR(-ENOMEM);
9290 } 9286 }
9291 9287
9288 if (cgrp->parent)
9289 ca->parent = cgroup_ca(cgrp->parent);
9290
9292 return &ca->css; 9291 return &ca->css;
9293} 9292}
9294 9293
@@ -9302,6 +9301,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9302 kfree(ca); 9301 kfree(ca);
9303} 9302}
9304 9303
9304static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9305{
9306 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9307 u64 data;
9308
9309#ifndef CONFIG_64BIT
9310 /*
9311 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9312 */
9313 spin_lock_irq(&cpu_rq(cpu)->lock);
9314 data = *cpuusage;
9315 spin_unlock_irq(&cpu_rq(cpu)->lock);
9316#else
9317 data = *cpuusage;
9318#endif
9319
9320 return data;
9321}
9322
9323static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9324{
9325 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9326
9327#ifndef CONFIG_64BIT
9328 /*
9329 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9330 */
9331 spin_lock_irq(&cpu_rq(cpu)->lock);
9332 *cpuusage = val;
9333 spin_unlock_irq(&cpu_rq(cpu)->lock);
9334#else
9335 *cpuusage = val;
9336#endif
9337}
9338
9305/* return total cpu usage (in nanoseconds) of a group */ 9339/* return total cpu usage (in nanoseconds) of a group */
9306static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9340static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9307{ 9341{
@@ -9309,17 +9343,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9309 u64 totalcpuusage = 0; 9343 u64 totalcpuusage = 0;
9310 int i; 9344 int i;
9311 9345
9312 for_each_possible_cpu(i) { 9346 for_each_present_cpu(i)
9313 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9347 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9314
9315 /*
9316 * Take rq->lock to make 64-bit addition safe on 32-bit
9317 * platforms.
9318 */
9319 spin_lock_irq(&cpu_rq(i)->lock);
9320 totalcpuusage += *cpuusage;
9321 spin_unlock_irq(&cpu_rq(i)->lock);
9322 }
9323 9348
9324 return totalcpuusage; 9349 return totalcpuusage;
9325} 9350}
@@ -9336,23 +9361,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9336 goto out; 9361 goto out;
9337 } 9362 }
9338 9363
9339 for_each_possible_cpu(i) { 9364 for_each_present_cpu(i)
9340 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9365 cpuacct_cpuusage_write(ca, i, 0);
9341 9366
9342 spin_lock_irq(&cpu_rq(i)->lock);
9343 *cpuusage = 0;
9344 spin_unlock_irq(&cpu_rq(i)->lock);
9345 }
9346out: 9367out:
9347 return err; 9368 return err;
9348} 9369}
9349 9370
9371static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9372 struct seq_file *m)
9373{
9374 struct cpuacct *ca = cgroup_ca(cgroup);
9375 u64 percpu;
9376 int i;
9377
9378 for_each_present_cpu(i) {
9379 percpu = cpuacct_cpuusage_read(ca, i);
9380 seq_printf(m, "%llu ", (unsigned long long) percpu);
9381 }
9382 seq_printf(m, "\n");
9383 return 0;
9384}
9385
9350static struct cftype files[] = { 9386static struct cftype files[] = {
9351 { 9387 {
9352 .name = "usage", 9388 .name = "usage",
9353 .read_u64 = cpuusage_read, 9389 .read_u64 = cpuusage_read,
9354 .write_u64 = cpuusage_write, 9390 .write_u64 = cpuusage_write,
9355 }, 9391 },
9392 {
9393 .name = "usage_percpu",
9394 .read_seq_string = cpuacct_percpu_seq_read,
9395 },
9396
9356}; 9397};
9357 9398
9358static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9399static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9368,14 +9409,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9368static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9409static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9369{ 9410{
9370 struct cpuacct *ca; 9411 struct cpuacct *ca;
9412 int cpu;
9371 9413
9372 if (!cpuacct_subsys.active) 9414 if (!cpuacct_subsys.active)
9373 return; 9415 return;
9374 9416
9417 cpu = task_cpu(tsk);
9375 ca = task_ca(tsk); 9418 ca = task_ca(tsk);
9376 if (ca) {
9377 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9378 9419
9420 for (; ca; ca = ca->parent) {
9421 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9379 *cpuusage += cputime; 9422 *cpuusage += cputime;
9380 } 9423 }
9381} 9424}