author     Ingo Molnar <mingo@elte.hu>   2008-12-29 03:45:15 -0500
committer  Ingo Molnar <mingo@elte.hu>   2008-12-29 03:45:15 -0500
commit     e1df957670aef74ffd9a4ad93e6d2c90bf6b4845 (patch)
tree       bca1fcfef55b3e3e82c9a822b4ac6428fce2b419 /kernel/sched.c
parent     2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 (diff)
parent     3c92ec8ae91ecf59d88c798301833d7cf83f2179 (diff)
Merge branch 'linus' into perfcounters/core
Conflicts:
        fs/exec.c
        include/linux/init_task.h

Simple context conflicts.
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   407
1 file changed, 225 insertions(+), 182 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4d84ff4c8774..3dfbff5fb1ac 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
  */
 #define RUNTIME_INF     ((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -261,6 +267,10 @@ struct task_group {
         struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+        uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
         /* schedulable entities of this group on each cpu */
         struct sched_entity **se;
@@ -286,6 +296,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+        user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *      Every UID task group (including init_task_group aka UID-0) will
@@ -345,7 +361,9 @@ static inline struct task_group *task_group(struct task_struct *p)
         struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-        tg = p->user->tg;
+        rcu_read_lock();
+        tg = __task_cred(p)->user->tg;
+        rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                           struct task_group, css);
@@ -586,6 +604,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
         /* latency stats */
         struct sched_info rq_sched_info;
+        unsigned long long rq_cpu_time;
+        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
         /* sys_sched_yield() stats */
         unsigned int yld_exp_empty;
@@ -703,45 +723,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-        filp->private_data = inode->i_private;
-        return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-                size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-        char *buf;
-        int r = 0;
-        int len = 0;
         int i;
 
         for (i = 0; sched_feat_names[i]; i++) {
-                len += strlen(sched_feat_names[i]);
-                len += 4;
-        }
-
-        buf = kmalloc(len + 2, GFP_KERNEL);
-        if (!buf)
-                return -ENOMEM;
-
-        for (i = 0; sched_feat_names[i]; i++) {
-                if (sysctl_sched_features & (1UL << i))
-                        r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-                else
-                        r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+                if (!(sysctl_sched_features & (1UL << i)))
+                        seq_puts(m, "NO_");
+                seq_printf(m, "%s ", sched_feat_names[i]);
         }
+        seq_puts(m, "\n");
 
-        r += sprintf(buf + r, "\n");
-        WARN_ON(r >= len + 2);
-
-        r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-        kfree(buf);
-
-        return r;
+        return 0;
 }
 
 static ssize_t
@@ -786,10 +779,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+        return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
         .open           = sched_feat_open,
-        .read           = sched_feat_read,
-        .write          = sched_feat_write,
+        .write          = sched_feat_write,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1494,27 +1494,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                         unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-        int boost = 0;
         unsigned long shares;
         unsigned long rq_weight;
 
         if (!tg->se[cpu])
                 return;
 
-        rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-        /*
-         * If there are currently no tasks on the cpu pretend there is one of
-         * average load so that when a new task gets to run here it will not
-         * get delayed by group starvation.
-         */
-        if (!rq_weight) {
-                boost = 1;
-                rq_weight = NICE_0_LOAD;
-        }
-
-        if (unlikely(rq_weight > sd_rq_weight))
-                rq_weight = sd_rq_weight;
+        rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
         /*
          *           \Sum shares * rq_weight
@@ -1522,7 +1508,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
          *               \Sum rq_weight
          *
          */
-        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = (sd_shares * rq_weight) / sd_rq_weight;
         shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
         if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1531,11 +1517,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                 unsigned long flags;
 
                 spin_lock_irqsave(&rq->lock, flags);
-                /*
-                 * record the actual number of shares, not the boosted amount.
-                 */
-                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-                tg->cfs_rq[cpu]->rq_weight = rq_weight;
+                tg->cfs_rq[cpu]->shares = shares;
 
                 __set_se_shares(tg->se[cpu], shares);
                 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1549,13 +1531,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long rq_weight = 0;
+        unsigned long weight, rq_weight = 0;
         unsigned long shares = 0;
         struct sched_domain *sd = data;
         int i;
 
         for_each_cpu_mask(i, sd->span) {
-                rq_weight += tg->cfs_rq[i]->load.weight;
+                /*
+                 * If there are currently no tasks on the cpu pretend there
+                 * is one of average load so that when a new task gets to
+                 * run here it will not get delayed by group starvation.
+                 */
+                weight = tg->cfs_rq[i]->load.weight;
+                if (!weight)
+                        weight = NICE_0_LOAD;
+
+                tg->cfs_rq[i]->rq_weight = weight;
+                rq_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
 
@@ -1565,9 +1557,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
 
-        if (!rq_weight)
-                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
         for_each_cpu_mask(i, sd->span)
                 update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1632,6 +1621,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+        __releases(this_rq->lock)
+        __acquires(busiest->lock)
+        __acquires(this_rq->lock)
+{
+        int ret = 0;
+
+        if (unlikely(!irqs_disabled())) {
+                /* printk() doesn't work good under rq->lock */
+                spin_unlock(&this_rq->lock);
+                BUG_ON(1);
+        }
+        if (unlikely(!spin_trylock(&busiest->lock))) {
+                if (busiest < this_rq) {
+                        spin_unlock(&this_rq->lock);
+                        spin_lock(&busiest->lock);
+                        spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                        ret = 1;
+                } else
+                        spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+        }
+        return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+        __releases(busiest->lock)
+{
+        spin_unlock(&busiest->lock);
+        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1865,6 +1887,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
         clock_offset = old_rq->clock - new_rq->clock;
 
+        trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
                 p->se.wait_start -= clock_offset;
@@ -2297,6 +2321,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
         smp_wmb();
         rq = task_rq_lock(p, &flags);
+        update_rq_clock(rq);
         old_state = p->state;
         if (!(old_state & state))
                 goto out;
@@ -2354,12 +2379,11 @@ out_activate:
                 schedstat_inc(p, se.nr_wakeups_local);
         else
                 schedstat_inc(p, se.nr_wakeups_remote);
-        update_rq_clock(rq);
         activate_task(rq, p, 1);
         success = 1;
 
 out_running:
-        trace_sched_wakeup(rq, p);
+        trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
 
         p->state = TASK_RUNNING;
@@ -2493,7 +2517,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-        trace_sched_wakeup_new(rq, p);
+        trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@ -2857,40 +2881,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 }
 
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-        __releases(this_rq->lock)
-        __acquires(busiest->lock)
-        __acquires(this_rq->lock)
-{
-        int ret = 0;
-
-        if (unlikely(!irqs_disabled())) {
-                /* printk() doesn't work good under rq->lock */
-                spin_unlock(&this_rq->lock);
-                BUG_ON(1);
-        }
-        if (unlikely(!spin_trylock(&busiest->lock))) {
-                if (busiest < this_rq) {
-                        spin_unlock(&this_rq->lock);
-                        spin_lock(&busiest->lock);
-                        spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                        ret = 1;
-                } else
-                        spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-        }
-        return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-        __releases(busiest->lock)
-{
-        spin_unlock(&busiest->lock);
-        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2907,7 +2897,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
 
-        trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@ -3752,7 +3741,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
         struct sched_domain *sd;
-        int pulled_task = -1;
+        int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
         cpumask_t tmpmask;
 
@@ -5204,6 +5193,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
         set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+        const struct cred *cred = current_cred(), *pcred;
+        bool match;
+
+        rcu_read_lock();
+        pcred = __task_cred(p);
+        match = (cred->euid == pcred->euid ||
+                 cred->euid == pcred->uid);
+        rcu_read_unlock();
+        return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
                                 struct sched_param *param, bool user)
 {
@@ -5263,8 +5268,7 @@ recheck:
                         return -EPERM;
 
                 /* can't change other user's priorities */
-                if ((current->euid != p->euid) &&
-                    (current->euid != p->uid))
+                if (!check_same_owner(p))
                         return -EPERM;
         }
 
@@ -5496,8 +5500,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
         read_unlock(&tasklist_lock);
 
         retval = -EPERM;
-        if ((current->euid != p->euid) && (current->euid != p->uid) &&
-            !capable(CAP_SYS_NICE))
+        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
 
         retval = security_task_setscheduler(p, 0, NULL);
@@ -5966,6 +5969,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
          * The idle tasks have their own, simple scheduling class:
          */
         idle->sched_class = &idle_sched_class;
+        ftrace_graph_init_task(idle);
 }
 
 /*
@@ -6196,7 +6200,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6708,28 +6711,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-        switch (lvl) {
-        case SD_LV_NONE:
-                return "NONE";
-        case SD_LV_SIBLING:
-                return "SIBLING";
-        case SD_LV_MC:
-                return "MC";
-        case SD_LV_CPU:
-                return "CPU";
-        case SD_LV_NODE:
-                return "NODE";
-        case SD_LV_ALLNODES:
-                return "ALLNODES";
-        case SD_LV_MAX:
-                return "MAX";
-
-        }
-        return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                   cpumask_t *groupmask)
 {
@@ -6749,8 +6730,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 return -1;
         }
 
-        printk(KERN_CONT "span %s level %s\n",
-                str, sd_level_to_string(sd->level));
+        printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
         if (!cpu_isset(cpu, sd->span)) {
                 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6886,6 +6866,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_EXEC |
                                 SD_SHARE_CPUPOWER |
                                 SD_SHARE_PKG_RESOURCES);
+                if (nr_node_ids == 1)
+                        pflags &= ~SD_SERIALIZE;
         }
         if (~cflags & pflags)
                 return 0;
@@ -7406,13 +7388,21 @@ struct allmasks {
 };
 
 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC             1
-#define SCHED_CPUMASK_FREE(v)           kfree(v)
-#define SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+        *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+        kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC             0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
@@ -7488,9 +7478,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 return -ENOMEM;
         }
 
-#if SCHED_CPUMASK_ALLOC
         /* get space for all scratch cpumask variables */
-        allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+        sched_cpumask_alloc(&allmasks);
         if (!allmasks) {
                 printk(KERN_WARNING "Cannot alloc cpumask array\n");
                 kfree(rd);
@@ -7499,7 +7488,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
                 return -ENOMEM;
         }
-#endif
+
         tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7753,13 +7742,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 cpu_attach_domain(sd, rd, i);
         }
 
-        SCHED_CPUMASK_FREE((void *)allmasks);
+        sched_cpumask_free(allmasks);
         return 0;
 
 #ifdef CONFIG_NUMA
 error:
         free_sched_groups(cpu_map, tmpmask);
-        SCHED_CPUMASK_FREE((void *)allmasks);
+        sched_cpumask_free(allmasks);
         kfree(rd);
         return -ENOMEM;
 #endif
@@ -7782,8 +7771,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+        return 0;
 }
 
 /*
@@ -7823,8 +7818,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
         cpumask_t tmpmask;
         int i;
 
-        unregister_sched_domain_sysctl();
-
         for_each_cpu_mask_nr(i, *cpu_map)
                 cpu_attach_domain(NULL, &def_root_domain, i);
         synchronize_sched();
@@ -7877,17 +7870,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                              struct sched_domain_attr *dattr_new)
 {
         int i, j, n;
+        int new_topology;
 
         mutex_lock(&sched_domains_mutex);
 
         /* always unregister in case we don't destroy any domains */
         unregister_sched_domain_sysctl();
 
+        /* Let architecture update cpu core mappings. */
+        new_topology = arch_update_cpu_topology();
+
         n = doms_new ? ndoms_new : 0;
 
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
-                for (j = 0; j < n; j++) {
+                for (j = 0; j < n && !new_topology; j++) {
                         if (cpus_equal(doms_cur[i], doms_new[j])
                             && dattrs_equal(dattr_cur, i, dattr_new, j))
                                 goto match1;
@@ -7902,12 +7899,12 @@ match1:
                 ndoms_cur = 0;
                 doms_new = &fallback_doms;
                 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-                dattr_new = NULL;
+                WARN_ON_ONCE(dattr_new);
         }
 
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
-                for (j = 0; j < ndoms_cur; j++) {
+                for (j = 0; j < ndoms_cur && !new_topology; j++) {
                         if (cpus_equal(doms_new[i], doms_cur[j])
                             && dattrs_equal(dattr_new, i, dattr_cur, j))
                                 goto match2;
@@ -8562,7 +8559,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se, *parent_se;
+        struct sched_entity *se;
         struct rq *rq;
         int i;
 
@@ -8578,18 +8575,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         for_each_possible_cpu(i) {
                 rq = cpu_rq(i);
 
-                cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                                      GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                      GFP_KERNEL, cpu_to_node(i));
                 if (!cfs_rq)
                         goto err;
 
-                se = kmalloc_node(sizeof(struct sched_entity),
-                                  GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                se = kzalloc_node(sizeof(struct sched_entity),
+                                  GFP_KERNEL, cpu_to_node(i));
                 if (!se)
                         goto err;
 
-                parent_se = parent ? parent->se[i] : NULL;
-                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
         }
 
         return 1;
@@ -8650,7 +8646,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
         struct rt_rq *rt_rq;
-        struct sched_rt_entity *rt_se, *parent_se;
+        struct sched_rt_entity *rt_se;
         struct rq *rq;
         int i;
 
@@ -8667,18 +8663,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
         for_each_possible_cpu(i) {
                 rq = cpu_rq(i);
 
-                rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                                     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                 if (!rt_rq)
                         goto err;
 
-                rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                                     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                     GFP_KERNEL, cpu_to_node(i));
                 if (!rt_se)
                         goto err;
 
-                parent_se = parent ? parent->rt_se[i] : NULL;
-                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
         }
 
         return 1;
@@ -9321,11 +9316,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+        struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9359,6 +9355,9 @@ static struct cgroup_subsys_state *cpuacct_create(
                 return ERR_PTR(-ENOMEM);
         }
 
+        if (cgrp->parent)
+                ca->parent = cgroup_ca(cgrp->parent);
+
         return &ca->css;
 }
 
@@ -9372,6 +9371,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
         kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+        u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+        u64 data;
+
+#ifndef CONFIG_64BIT
+        /*
+         * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+         */
+        spin_lock_irq(&cpu_rq(cpu)->lock);
+        data = *cpuusage;
+        spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+        data = *cpuusage;
+#endif
+
+        return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+        u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+        /*
+         * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+         */
+        spin_lock_irq(&cpu_rq(cpu)->lock);
+        *cpuusage = val;
+        spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+        *cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9379,17 +9413,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
         u64 totalcpuusage = 0;
         int i;
 
-        for_each_possible_cpu(i) {
-                u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-                /*
-                 * Take rq->lock to make 64-bit addition safe on 32-bit
-                 * platforms.
-                 */
-                spin_lock_irq(&cpu_rq(i)->lock);
-                totalcpuusage += *cpuusage;
-                spin_unlock_irq(&cpu_rq(i)->lock);
-        }
+        for_each_present_cpu(i)
+                totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
         return totalcpuusage;
 }
@@ -9406,23 +9431,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
                 goto out;
         }
 
-        for_each_possible_cpu(i) {
-                u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+        for_each_present_cpu(i)
+                cpuacct_cpuusage_write(ca, i, 0);
 
-                spin_lock_irq(&cpu_rq(i)->lock);
-                *cpuusage = 0;
-                spin_unlock_irq(&cpu_rq(i)->lock);
-        }
 out:
         return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                   struct seq_file *m)
+{
+        struct cpuacct *ca = cgroup_ca(cgroup);
+        u64 percpu;
+        int i;
+
+        for_each_present_cpu(i) {
+                percpu = cpuacct_cpuusage_read(ca, i);
+                seq_printf(m, "%llu ", (unsigned long long) percpu);
+        }
+        seq_printf(m, "\n");
+        return 0;
+}
+
 static struct cftype files[] = {
         {
                 .name = "usage",
                 .read_u64 = cpuusage_read,
                 .write_u64 = cpuusage_write,
         },
+        {
+                .name = "usage_percpu",
+                .read_seq_string = cpuacct_percpu_seq_read,
+        },
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9438,14 +9479,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
         struct cpuacct *ca;
+        int cpu;
 
         if (!cpuacct_subsys.active)
                 return;
 
+        cpu = task_cpu(tsk);
         ca = task_ca(tsk);
-        if (ca) {
-                u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+        for (; ca; ca = ca->parent) {
+                u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
 }