aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c540
1 files changed, 301 insertions, 239 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index d897a524e7d8..c731dd820d1a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
55#include <linux/cpuset.h> 55#include <linux/cpuset.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 59#include <linux/seq_file.h>
59#include <linux/sysctl.h> 60#include <linux/sysctl.h>
60#include <linux/syscalls.h> 61#include <linux/syscalls.h>
@@ -71,6 +72,7 @@
71#include <linux/debugfs.h> 72#include <linux/debugfs.h>
72#include <linux/ctype.h> 73#include <linux/ctype.h>
73#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
74 76
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
@@ -116,6 +118,12 @@
116 */ 118 */
117#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
118 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
119#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
120/* 128/*
121 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -201,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205} 212}
206 213
207static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -226,9 +233,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
226 233
227 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 234 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
228 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 235 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
229 hrtimer_start(&rt_b->rt_period_timer, 236 hrtimer_start_expires(&rt_b->rt_period_timer,
230 rt_b->rt_period_timer.expires, 237 HRTIMER_MODE_ABS);
231 HRTIMER_MODE_ABS);
232 } 238 }
233 spin_unlock(&rt_b->rt_runtime_lock); 239 spin_unlock(&rt_b->rt_runtime_lock);
234} 240}
@@ -260,6 +266,10 @@ struct task_group {
260 struct cgroup_subsys_state css; 266 struct cgroup_subsys_state css;
261#endif 267#endif
262 268
269#ifdef CONFIG_USER_SCHED
270 uid_t uid;
271#endif
272
263#ifdef CONFIG_FAIR_GROUP_SCHED 273#ifdef CONFIG_FAIR_GROUP_SCHED
264 /* schedulable entities of this group on each cpu */ 274 /* schedulable entities of this group on each cpu */
265 struct sched_entity **se; 275 struct sched_entity **se;
@@ -285,6 +295,12 @@ struct task_group {
285 295
286#ifdef CONFIG_USER_SCHED 296#ifdef CONFIG_USER_SCHED
287 297
298/* Helper function to pass uid information to create_sched_user() */
299void set_tg_uid(struct user_struct *user)
300{
301 user->tg->uid = user->uid;
302}
303
288/* 304/*
289 * Root task group. 305 * Root task group.
290 * Every UID task group (including init_task_group aka UID-0) will 306 * Every UID task group (including init_task_group aka UID-0) will
@@ -344,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
344 struct task_group *tg; 360 struct task_group *tg;
345 361
346#ifdef CONFIG_USER_SCHED 362#ifdef CONFIG_USER_SCHED
347 tg = p->user->tg; 363 rcu_read_lock();
364 tg = __task_cred(p)->user->tg;
365 rcu_read_unlock();
348#elif defined(CONFIG_CGROUP_SCHED) 366#elif defined(CONFIG_CGROUP_SCHED)
349 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 367 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
350 struct task_group, css); 368 struct task_group, css);
@@ -385,7 +403,6 @@ struct cfs_rq {
385 403
386 u64 exec_clock; 404 u64 exec_clock;
387 u64 min_vruntime; 405 u64 min_vruntime;
388 u64 pair_start;
389 406
390 struct rb_root tasks_timeline; 407 struct rb_root tasks_timeline;
391 struct rb_node *rb_leftmost; 408 struct rb_node *rb_leftmost;
@@ -397,9 +414,9 @@ struct cfs_rq {
397 * 'curr' points to currently running entity on this cfs_rq. 414 * 'curr' points to currently running entity on this cfs_rq.
398 * It is set to NULL otherwise (i.e when none are currently running). 415 * It is set to NULL otherwise (i.e when none are currently running).
399 */ 416 */
400 struct sched_entity *curr, *next; 417 struct sched_entity *curr, *next, *last;
401 418
402 unsigned long nr_spread_over; 419 unsigned int nr_spread_over;
403 420
404#ifdef CONFIG_FAIR_GROUP_SCHED 421#ifdef CONFIG_FAIR_GROUP_SCHED
405 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 422 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -586,6 +603,8 @@ struct rq {
586#ifdef CONFIG_SCHEDSTATS 603#ifdef CONFIG_SCHEDSTATS
587 /* latency stats */ 604 /* latency stats */
588 struct sched_info rq_sched_info; 605 struct sched_info rq_sched_info;
606 unsigned long long rq_cpu_time;
607 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
589 608
590 /* sys_sched_yield() stats */ 609 /* sys_sched_yield() stats */
591 unsigned int yld_exp_empty; 610 unsigned int yld_exp_empty;
@@ -703,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
703 722
704#undef SCHED_FEAT 723#undef SCHED_FEAT
705 724
706static int sched_feat_open(struct inode *inode, struct file *filp) 725static int sched_feat_show(struct seq_file *m, void *v)
707{ 726{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 727 int i;
720 728
721 for (i = 0; sched_feat_names[i]; i++) { 729 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 730 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 731 seq_puts(m, "NO_");
732 seq_printf(m, "%s ", sched_feat_names[i]);
724 } 733 }
734 seq_puts(m, "\n");
725 735
726 buf = kmalloc(len + 2, GFP_KERNEL); 736 return 0;
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 }
736
737 r += sprintf(buf + r, "\n");
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 737}
746 738
747static ssize_t 739static ssize_t
@@ -786,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 778 return cnt;
787} 779}
788 780
781static int sched_feat_open(struct inode *inode, struct file *filp)
782{
783 return single_open(filp, sched_feat_show, NULL);
784}
785
789static struct file_operations sched_feat_fops = { 786static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 787 .open = sched_feat_open,
791 .read = sched_feat_read, 788 .write = sched_feat_write,
792 .write = sched_feat_write, 789 .read = seq_read,
790 .llseek = seq_lseek,
791 .release = single_release,
793}; 792};
794 793
795static __init int sched_init_debug(void) 794static __init int sched_init_debug(void)
@@ -818,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
818unsigned int sysctl_sched_shares_ratelimit = 250000; 817unsigned int sysctl_sched_shares_ratelimit = 250000;
819 818
820/* 819/*
820 * Inject some fuzzyness into changing the per-cpu group shares
821 * this avoids remote rq-locks at the expense of fairness.
822 * default: 4
823 */
824unsigned int sysctl_sched_shares_thresh = 4;
825
826/*
821 * period over which we measure -rt task cpu usage in us. 827 * period over which we measure -rt task cpu usage in us.
822 * default: 1s 828 * default: 1s
823 */ 829 */
@@ -962,6 +968,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
962 } 968 }
963} 969}
964 970
971void task_rq_unlock_wait(struct task_struct *p)
972{
973 struct rq *rq = task_rq(p);
974
975 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
976 spin_unlock_wait(&rq->lock);
977}
978
965static void __task_rq_unlock(struct rq *rq) 979static void __task_rq_unlock(struct rq *rq)
966 __releases(rq->lock) 980 __releases(rq->lock)
967{ 981{
@@ -1063,7 +1077,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1063 struct hrtimer *timer = &rq->hrtick_timer; 1077 struct hrtimer *timer = &rq->hrtick_timer;
1064 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1078 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1065 1079
1066 timer->expires = time; 1080 hrtimer_set_expires(timer, time);
1067 1081
1068 if (rq == this_rq()) { 1082 if (rq == this_rq()) {
1069 hrtimer_restart(timer); 1083 hrtimer_restart(timer);
@@ -1124,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
1124 1138
1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1139 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1126 rq->hrtick_timer.function = hrtick; 1140 rq->hrtick_timer.function = hrtick;
1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1128} 1141}
1129#else /* CONFIG_SCHED_HRTICK */ 1142#else /* CONFIG_SCHED_HRTICK */
1130static inline void hrtick_clear(struct rq *rq) 1143static inline void hrtick_clear(struct rq *rq)
@@ -1438,9 +1451,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1438static unsigned long cpu_avg_load_per_task(int cpu) 1451static unsigned long cpu_avg_load_per_task(int cpu)
1439{ 1452{
1440 struct rq *rq = cpu_rq(cpu); 1453 struct rq *rq = cpu_rq(cpu);
1454 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1441 1455
1442 if (rq->nr_running) 1456 if (nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running; 1457 rq->avg_load_per_task = rq->load.weight / nr_running;
1458 else
1459 rq->avg_load_per_task = 0;
1444 1460
1445 return rq->avg_load_per_task; 1461 return rq->avg_load_per_task;
1446} 1462}
@@ -1453,30 +1469,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1453 * Calculate and set the cpu's group shares. 1469 * Calculate and set the cpu's group shares.
1454 */ 1470 */
1455static void 1471static void
1456__update_group_shares_cpu(struct task_group *tg, int cpu, 1472update_group_shares_cpu(struct task_group *tg, int cpu,
1457 unsigned long sd_shares, unsigned long sd_rq_weight) 1473 unsigned long sd_shares, unsigned long sd_rq_weight)
1458{ 1474{
1459 int boost = 0;
1460 unsigned long shares; 1475 unsigned long shares;
1461 unsigned long rq_weight; 1476 unsigned long rq_weight;
1462 1477
1463 if (!tg->se[cpu]) 1478 if (!tg->se[cpu])
1464 return; 1479 return;
1465 1480
1466 rq_weight = tg->cfs_rq[cpu]->load.weight; 1481 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1467
1468 /*
1469 * If there are currently no tasks on the cpu pretend there is one of
1470 * average load so that when a new task gets to run here it will not
1471 * get delayed by group starvation.
1472 */
1473 if (!rq_weight) {
1474 boost = 1;
1475 rq_weight = NICE_0_LOAD;
1476 }
1477
1478 if (unlikely(rq_weight > sd_rq_weight))
1479 rq_weight = sd_rq_weight;
1480 1482
1481 /* 1483 /*
1482 * \Sum shares * rq_weight 1484 * \Sum shares * rq_weight
@@ -1484,20 +1486,20 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1484 * \Sum rq_weight 1486 * \Sum rq_weight
1485 * 1487 *
1486 */ 1488 */
1487 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1489 shares = (sd_shares * rq_weight) / sd_rq_weight;
1490 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1488 1491
1489 /* 1492 if (abs(shares - tg->se[cpu]->load.weight) >
1490 * record the actual number of shares, not the boosted amount. 1493 sysctl_sched_shares_thresh) {
1491 */ 1494 struct rq *rq = cpu_rq(cpu);
1492 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1495 unsigned long flags;
1493 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1494 1496
1495 if (shares < MIN_SHARES) 1497 spin_lock_irqsave(&rq->lock, flags);
1496 shares = MIN_SHARES; 1498 tg->cfs_rq[cpu]->shares = shares;
1497 else if (shares > MAX_SHARES)
1498 shares = MAX_SHARES;
1499 1499
1500 __set_se_shares(tg->se[cpu], shares); 1500 __set_se_shares(tg->se[cpu], shares);
1501 spin_unlock_irqrestore(&rq->lock, flags);
1502 }
1501} 1503}
1502 1504
1503/* 1505/*
@@ -1507,13 +1509,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1507 */ 1509 */
1508static int tg_shares_up(struct task_group *tg, void *data) 1510static int tg_shares_up(struct task_group *tg, void *data)
1509{ 1511{
1510 unsigned long rq_weight = 0; 1512 unsigned long weight, rq_weight = 0;
1511 unsigned long shares = 0; 1513 unsigned long shares = 0;
1512 struct sched_domain *sd = data; 1514 struct sched_domain *sd = data;
1513 int i; 1515 int i;
1514 1516
1515 for_each_cpu_mask(i, sd->span) { 1517 for_each_cpu_mask(i, sd->span) {
1516 rq_weight += tg->cfs_rq[i]->load.weight; 1518 /*
1519 * If there are currently no tasks on the cpu pretend there
1520 * is one of average load so that when a new task gets to
1521 * run here it will not get delayed by group starvation.
1522 */
1523 weight = tg->cfs_rq[i]->load.weight;
1524 if (!weight)
1525 weight = NICE_0_LOAD;
1526
1527 tg->cfs_rq[i]->rq_weight = weight;
1528 rq_weight += weight;
1517 shares += tg->cfs_rq[i]->shares; 1529 shares += tg->cfs_rq[i]->shares;
1518 } 1530 }
1519 1531
@@ -1523,17 +1535,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
1523 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1535 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1524 shares = tg->shares; 1536 shares = tg->shares;
1525 1537
1526 if (!rq_weight) 1538 for_each_cpu_mask(i, sd->span)
1527 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; 1539 update_group_shares_cpu(tg, i, shares, rq_weight);
1528
1529 for_each_cpu_mask(i, sd->span) {
1530 struct rq *rq = cpu_rq(i);
1531 unsigned long flags;
1532
1533 spin_lock_irqsave(&rq->lock, flags);
1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1535 spin_unlock_irqrestore(&rq->lock, flags);
1536 }
1537 1540
1538 return 0; 1541 return 0;
1539} 1542}
@@ -1596,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1596 1599
1597#endif 1600#endif
1598 1601
1602/*
1603 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1604 */
1605static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1606 __releases(this_rq->lock)
1607 __acquires(busiest->lock)
1608 __acquires(this_rq->lock)
1609{
1610 int ret = 0;
1611
1612 if (unlikely(!irqs_disabled())) {
1613 /* printk() doesn't work good under rq->lock */
1614 spin_unlock(&this_rq->lock);
1615 BUG_ON(1);
1616 }
1617 if (unlikely(!spin_trylock(&busiest->lock))) {
1618 if (busiest < this_rq) {
1619 spin_unlock(&this_rq->lock);
1620 spin_lock(&busiest->lock);
1621 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1622 ret = 1;
1623 } else
1624 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1625 }
1626 return ret;
1627}
1628
1629static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1630 __releases(busiest->lock)
1631{
1632 spin_unlock(&busiest->lock);
1633 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1634}
1599#endif 1635#endif
1600 1636
1601#ifdef CONFIG_FAIR_GROUP_SCHED 1637#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1800,7 +1836,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1800 /* 1836 /*
1801 * Buddy candidates are cache hot: 1837 * Buddy candidates are cache hot:
1802 */ 1838 */
1803 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) 1839 if (sched_feat(CACHE_HOT_BUDDY) &&
1840 (&p->se == cfs_rq_of(&p->se)->next ||
1841 &p->se == cfs_rq_of(&p->se)->last))
1804 return 1; 1842 return 1;
1805 1843
1806 if (p->sched_class != &fair_sched_class) 1844 if (p->sched_class != &fair_sched_class)
@@ -1827,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1827 1865
1828 clock_offset = old_rq->clock - new_rq->clock; 1866 clock_offset = old_rq->clock - new_rq->clock;
1829 1867
1868 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1869
1830#ifdef CONFIG_SCHEDSTATS 1870#ifdef CONFIG_SCHEDSTATS
1831 if (p->se.wait_start) 1871 if (p->se.wait_start)
1832 p->se.wait_start -= clock_offset; 1872 p->se.wait_start -= clock_offset;
@@ -1936,6 +1976,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1936 * just go back and repeat. 1976 * just go back and repeat.
1937 */ 1977 */
1938 rq = task_rq_lock(p, &flags); 1978 rq = task_rq_lock(p, &flags);
1979 trace_sched_wait_task(rq, p);
1939 running = task_running(rq, p); 1980 running = task_running(rq, p);
1940 on_rq = p->se.on_rq; 1981 on_rq = p->se.on_rq;
1941 ncsw = 0; 1982 ncsw = 0;
@@ -2235,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2235 2276
2236 smp_wmb(); 2277 smp_wmb();
2237 rq = task_rq_lock(p, &flags); 2278 rq = task_rq_lock(p, &flags);
2279 update_rq_clock(rq);
2238 old_state = p->state; 2280 old_state = p->state;
2239 if (!(old_state & state)) 2281 if (!(old_state & state))
2240 goto out; 2282 goto out;
@@ -2292,14 +2334,11 @@ out_activate:
2292 schedstat_inc(p, se.nr_wakeups_local); 2334 schedstat_inc(p, se.nr_wakeups_local);
2293 else 2335 else
2294 schedstat_inc(p, se.nr_wakeups_remote); 2336 schedstat_inc(p, se.nr_wakeups_remote);
2295 update_rq_clock(rq);
2296 activate_task(rq, p, 1); 2337 activate_task(rq, p, 1);
2297 success = 1; 2338 success = 1;
2298 2339
2299out_running: 2340out_running:
2300 trace_mark(kernel_sched_wakeup, 2341 trace_sched_wakeup(rq, p, success);
2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2302 p->pid, p->state, rq, p, rq->curr);
2303 check_preempt_curr(rq, p, sync); 2342 check_preempt_curr(rq, p, sync);
2304 2343
2305 p->state = TASK_RUNNING; 2344 p->state = TASK_RUNNING;
@@ -2432,9 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2432 p->sched_class->task_new(rq, p); 2471 p->sched_class->task_new(rq, p);
2433 inc_nr_running(rq); 2472 inc_nr_running(rq);
2434 } 2473 }
2435 trace_mark(kernel_sched_wakeup_new, 2474 trace_sched_wakeup_new(rq, p, 1);
2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2437 p->pid, p->state, rq, p, rq->curr);
2438 check_preempt_curr(rq, p, 0); 2475 check_preempt_curr(rq, p, 0);
2439#ifdef CONFIG_SMP 2476#ifdef CONFIG_SMP
2440 if (p->sched_class->task_wake_up) 2477 if (p->sched_class->task_wake_up)
@@ -2607,11 +2644,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2607 struct mm_struct *mm, *oldmm; 2644 struct mm_struct *mm, *oldmm;
2608 2645
2609 prepare_task_switch(rq, prev, next); 2646 prepare_task_switch(rq, prev, next);
2610 trace_mark(kernel_sched_schedule, 2647 trace_sched_switch(rq, prev, next);
2611 "prev_pid %d next_pid %d prev_state %ld "
2612 "## rq %p prev %p next %p",
2613 prev->pid, next->pid, prev->state,
2614 rq, prev, next);
2615 mm = next->mm; 2648 mm = next->mm;
2616 oldmm = prev->active_mm; 2649 oldmm = prev->active_mm;
2617 /* 2650 /*
@@ -2801,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2801} 2834}
2802 2835
2803/* 2836/*
2804 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2805 */
2806static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2807 __releases(this_rq->lock)
2808 __acquires(busiest->lock)
2809 __acquires(this_rq->lock)
2810{
2811 int ret = 0;
2812
2813 if (unlikely(!irqs_disabled())) {
2814 /* printk() doesn't work good under rq->lock */
2815 spin_unlock(&this_rq->lock);
2816 BUG_ON(1);
2817 }
2818 if (unlikely(!spin_trylock(&busiest->lock))) {
2819 if (busiest < this_rq) {
2820 spin_unlock(&this_rq->lock);
2821 spin_lock(&busiest->lock);
2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2823 ret = 1;
2824 } else
2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2826 }
2827 return ret;
2828}
2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
2837/*
2838 * If dest_cpu is allowed for this process, migrate the task to it. 2837 * If dest_cpu is allowed for this process, migrate the task to it.
2839 * This is accomplished by forcing the cpu_allowed mask to only 2838 * This is accomplished by forcing the cpu_allowed mask to only
2840 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2839 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -3344,7 +3343,7 @@ small_imbalance:
3344 } else 3343 } else
3345 this_load_per_task = cpu_avg_load_per_task(this_cpu); 3344 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3346 3345
3347 if (max_load - this_load + 2*busiest_load_per_task >= 3346 if (max_load - this_load + busiest_load_per_task >=
3348 busiest_load_per_task * imbn) { 3347 busiest_load_per_task * imbn) {
3349 *imbalance = busiest_load_per_task; 3348 *imbalance = busiest_load_per_task;
3350 return busiest; 3349 return busiest;
@@ -3695,7 +3694,7 @@ out_balanced:
3695static void idle_balance(int this_cpu, struct rq *this_rq) 3694static void idle_balance(int this_cpu, struct rq *this_rq)
3696{ 3695{
3697 struct sched_domain *sd; 3696 struct sched_domain *sd;
3698 int pulled_task = -1; 3697 int pulled_task = 0;
3699 unsigned long next_balance = jiffies + HZ; 3698 unsigned long next_balance = jiffies + HZ;
3700 cpumask_t tmpmask; 3699 cpumask_t tmpmask;
3701 3700
@@ -4052,23 +4051,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4052EXPORT_PER_CPU_SYMBOL(kstat); 4051EXPORT_PER_CPU_SYMBOL(kstat);
4053 4052
4054/* 4053/*
4055 * Return p->sum_exec_runtime plus any more ns on the sched_clock 4054 * Return any ns on the sched_clock that have not yet been banked in
4056 * that have not yet been banked in case the task is currently running. 4055 * @p in case that task is currently running.
4057 */ 4056 */
4058unsigned long long task_sched_runtime(struct task_struct *p) 4057unsigned long long task_delta_exec(struct task_struct *p)
4059{ 4058{
4060 unsigned long flags; 4059 unsigned long flags;
4061 u64 ns, delta_exec;
4062 struct rq *rq; 4060 struct rq *rq;
4061 u64 ns = 0;
4063 4062
4064 rq = task_rq_lock(p, &flags); 4063 rq = task_rq_lock(p, &flags);
4065 ns = p->se.sum_exec_runtime; 4064
4066 if (task_current(rq, p)) { 4065 if (task_current(rq, p)) {
4066 u64 delta_exec;
4067
4067 update_rq_clock(rq); 4068 update_rq_clock(rq);
4068 delta_exec = rq->clock - p->se.exec_start; 4069 delta_exec = rq->clock - p->se.exec_start;
4069 if ((s64)delta_exec > 0) 4070 if ((s64)delta_exec > 0)
4070 ns += delta_exec; 4071 ns = delta_exec;
4071 } 4072 }
4073
4072 task_rq_unlock(rq, &flags); 4074 task_rq_unlock(rq, &flags);
4073 4075
4074 return ns; 4076 return ns;
@@ -4085,6 +4087,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4085 cputime64_t tmp; 4087 cputime64_t tmp;
4086 4088
4087 p->utime = cputime_add(p->utime, cputime); 4089 p->utime = cputime_add(p->utime, cputime);
4090 account_group_user_time(p, cputime);
4088 4091
4089 /* Add user time to cpustat. */ 4092 /* Add user time to cpustat. */
4090 tmp = cputime_to_cputime64(cputime); 4093 tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4112,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
4109 tmp = cputime_to_cputime64(cputime); 4112 tmp = cputime_to_cputime64(cputime);
4110 4113
4111 p->utime = cputime_add(p->utime, cputime); 4114 p->utime = cputime_add(p->utime, cputime);
4115 account_group_user_time(p, cputime);
4112 p->gtime = cputime_add(p->gtime, cputime); 4116 p->gtime = cputime_add(p->gtime, cputime);
4113 4117
4114 cpustat->user = cputime64_add(cpustat->user, tmp); 4118 cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4148,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4144 } 4148 }
4145 4149
4146 p->stime = cputime_add(p->stime, cputime); 4150 p->stime = cputime_add(p->stime, cputime);
4151 account_group_system_time(p, cputime);
4147 4152
4148 /* Add system time to cpustat. */ 4153 /* Add system time to cpustat. */
4149 tmp = cputime_to_cputime64(cputime); 4154 tmp = cputime_to_cputime64(cputime);
@@ -4320,7 +4325,7 @@ void __kprobes sub_preempt_count(int val)
4320 /* 4325 /*
4321 * Underflow? 4326 * Underflow?
4322 */ 4327 */
4323 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4328 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4324 return; 4329 return;
4325 /* 4330 /*
4326 * Is the spinlock portion underflowing? 4331 * Is the spinlock portion underflowing?
@@ -4441,12 +4446,8 @@ need_resched_nonpreemptible:
4441 if (sched_feat(HRTICK)) 4446 if (sched_feat(HRTICK))
4442 hrtick_clear(rq); 4447 hrtick_clear(rq);
4443 4448
4444 /* 4449 spin_lock_irq(&rq->lock);
4445 * Do the rq-clock update outside the rq lock:
4446 */
4447 local_irq_disable();
4448 update_rq_clock(rq); 4450 update_rq_clock(rq);
4449 spin_lock(&rq->lock);
4450 clear_tsk_need_resched(prev); 4451 clear_tsk_need_resched(prev);
4451 4452
4452 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4453 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -5119,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5119 set_load_weight(p); 5120 set_load_weight(p);
5120} 5121}
5121 5122
5123/*
5124 * check the target process has a UID that matches the current process's
5125 */
5126static bool check_same_owner(struct task_struct *p)
5127{
5128 const struct cred *cred = current_cred(), *pcred;
5129 bool match;
5130
5131 rcu_read_lock();
5132 pcred = __task_cred(p);
5133 match = (cred->euid == pcred->euid ||
5134 cred->euid == pcred->uid);
5135 rcu_read_unlock();
5136 return match;
5137}
5138
5122static int __sched_setscheduler(struct task_struct *p, int policy, 5139static int __sched_setscheduler(struct task_struct *p, int policy,
5123 struct sched_param *param, bool user) 5140 struct sched_param *param, bool user)
5124{ 5141{
@@ -5178,8 +5195,7 @@ recheck:
5178 return -EPERM; 5195 return -EPERM;
5179 5196
5180 /* can't change other user's priorities */ 5197 /* can't change other user's priorities */
5181 if ((current->euid != p->euid) && 5198 if (!check_same_owner(p))
5182 (current->euid != p->uid))
5183 return -EPERM; 5199 return -EPERM;
5184 } 5200 }
5185 5201
@@ -5411,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5411 read_unlock(&tasklist_lock); 5427 read_unlock(&tasklist_lock);
5412 5428
5413 retval = -EPERM; 5429 retval = -EPERM;
5414 if ((current->euid != p->euid) && (current->euid != p->uid) && 5430 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5415 !capable(CAP_SYS_NICE))
5416 goto out_unlock; 5431 goto out_unlock;
5417 5432
5418 retval = security_task_setscheduler(p, 0, NULL); 5433 retval = security_task_setscheduler(p, 0, NULL);
@@ -5851,6 +5866,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5851 struct rq *rq = cpu_rq(cpu); 5866 struct rq *rq = cpu_rq(cpu);
5852 unsigned long flags; 5867 unsigned long flags;
5853 5868
5869 spin_lock_irqsave(&rq->lock, flags);
5870
5854 __sched_fork(idle); 5871 __sched_fork(idle);
5855 idle->se.exec_start = sched_clock(); 5872 idle->se.exec_start = sched_clock();
5856 5873
@@ -5858,7 +5875,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5858 idle->cpus_allowed = cpumask_of_cpu(cpu); 5875 idle->cpus_allowed = cpumask_of_cpu(cpu);
5859 __set_task_cpu(idle, cpu); 5876 __set_task_cpu(idle, cpu);
5860 5877
5861 spin_lock_irqsave(&rq->lock, flags);
5862 rq->curr = rq->idle = idle; 5878 rq->curr = rq->idle = idle;
5863#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5879#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5864 idle->oncpu = 1; 5880 idle->oncpu = 1;
@@ -5875,6 +5891,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5875 * The idle tasks have their own, simple scheduling class: 5891 * The idle tasks have their own, simple scheduling class:
5876 */ 5892 */
5877 idle->sched_class = &idle_sched_class; 5893 idle->sched_class = &idle_sched_class;
5894 ftrace_graph_init_task(idle);
5878} 5895}
5879 5896
5880/* 5897/*
@@ -6105,7 +6122,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6105 6122
6106/* 6123/*
6107 * Figure out where task on dead CPU should go, use force if necessary. 6124 * Figure out where task on dead CPU should go, use force if necessary.
6108 * NOTE: interrupts should be disabled by the caller
6109 */ 6125 */
6110static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6126static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6111{ 6127{
@@ -6566,7 +6582,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6566 req = list_entry(rq->migration_queue.next, 6582 req = list_entry(rq->migration_queue.next,
6567 struct migration_req, list); 6583 struct migration_req, list);
6568 list_del_init(&req->list); 6584 list_del_init(&req->list);
6585 spin_unlock_irq(&rq->lock);
6569 complete(&req->done); 6586 complete(&req->done);
6587 spin_lock_irq(&rq->lock);
6570 } 6588 }
6571 spin_unlock_irq(&rq->lock); 6589 spin_unlock_irq(&rq->lock);
6572 break; 6590 break;
@@ -6615,28 +6633,6 @@ early_initcall(migration_init);
6615 6633
6616#ifdef CONFIG_SCHED_DEBUG 6634#ifdef CONFIG_SCHED_DEBUG
6617 6635
6618static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6619{
6620 switch (lvl) {
6621 case SD_LV_NONE:
6622 return "NONE";
6623 case SD_LV_SIBLING:
6624 return "SIBLING";
6625 case SD_LV_MC:
6626 return "MC";
6627 case SD_LV_CPU:
6628 return "CPU";
6629 case SD_LV_NODE:
6630 return "NODE";
6631 case SD_LV_ALLNODES:
6632 return "ALLNODES";
6633 case SD_LV_MAX:
6634 return "MAX";
6635
6636 }
6637 return "MAX";
6638}
6639
6640static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6636static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6641 cpumask_t *groupmask) 6637 cpumask_t *groupmask)
6642{ 6638{
@@ -6656,8 +6652,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6656 return -1; 6652 return -1;
6657 } 6653 }
6658 6654
6659 printk(KERN_CONT "span %s level %s\n", 6655 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6660 str, sd_level_to_string(sd->level));
6661 6656
6662 if (!cpu_isset(cpu, sd->span)) { 6657 if (!cpu_isset(cpu, sd->span)) {
6663 printk(KERN_ERR "ERROR: domain->span does not contain " 6658 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6793,6 +6788,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6793 SD_BALANCE_EXEC | 6788 SD_BALANCE_EXEC |
6794 SD_SHARE_CPUPOWER | 6789 SD_SHARE_CPUPOWER |
6795 SD_SHARE_PKG_RESOURCES); 6790 SD_SHARE_PKG_RESOURCES);
6791 if (nr_node_ids == 1)
6792 pflags &= ~SD_SERIALIZE;
6796 } 6793 }
6797 if (~cflags & pflags) 6794 if (~cflags & pflags)
6798 return 0; 6795 return 0;
@@ -6868,15 +6865,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6868 struct sched_domain *tmp; 6865 struct sched_domain *tmp;
6869 6866
6870 /* Remove the sched domains which do not contribute to scheduling. */ 6867 /* Remove the sched domains which do not contribute to scheduling. */
6871 for (tmp = sd; tmp; tmp = tmp->parent) { 6868 for (tmp = sd; tmp; ) {
6872 struct sched_domain *parent = tmp->parent; 6869 struct sched_domain *parent = tmp->parent;
6873 if (!parent) 6870 if (!parent)
6874 break; 6871 break;
6872
6875 if (sd_parent_degenerate(tmp, parent)) { 6873 if (sd_parent_degenerate(tmp, parent)) {
6876 tmp->parent = parent->parent; 6874 tmp->parent = parent->parent;
6877 if (parent->parent) 6875 if (parent->parent)
6878 parent->parent->child = tmp; 6876 parent->parent->child = tmp;
6879 } 6877 } else
6878 tmp = tmp->parent;
6880 } 6879 }
6881 6880
6882 if (sd && sd_degenerate(sd)) { 6881 if (sd && sd_degenerate(sd)) {
@@ -7311,13 +7310,21 @@ struct allmasks {
7311}; 7310};
7312 7311
7313#if NR_CPUS > 128 7312#if NR_CPUS > 128
7314#define SCHED_CPUMASK_ALLOC 1 7313#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7315#define SCHED_CPUMASK_FREE(v) kfree(v) 7314static inline void sched_cpumask_alloc(struct allmasks **masks)
7316#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v 7315{
7316 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7317}
7318static inline void sched_cpumask_free(struct allmasks *masks)
7319{
7320 kfree(masks);
7321}
7317#else 7322#else
7318#define SCHED_CPUMASK_ALLOC 0 7323#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7319#define SCHED_CPUMASK_FREE(v) 7324static inline void sched_cpumask_alloc(struct allmasks **masks)
7320#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v 7325{ }
7326static inline void sched_cpumask_free(struct allmasks *masks)
7327{ }
7321#endif 7328#endif
7322 7329
7323#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ 7330#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
@@ -7393,9 +7400,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7393 return -ENOMEM; 7400 return -ENOMEM;
7394 } 7401 }
7395 7402
7396#if SCHED_CPUMASK_ALLOC
7397 /* get space for all scratch cpumask variables */ 7403 /* get space for all scratch cpumask variables */
7398 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); 7404 sched_cpumask_alloc(&allmasks);
7399 if (!allmasks) { 7405 if (!allmasks) {
7400 printk(KERN_WARNING "Cannot alloc cpumask array\n"); 7406 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7401 kfree(rd); 7407 kfree(rd);
@@ -7404,7 +7410,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7404#endif 7410#endif
7405 return -ENOMEM; 7411 return -ENOMEM;
7406 } 7412 }
7407#endif 7413
7408 tmpmask = (cpumask_t *)allmasks; 7414 tmpmask = (cpumask_t *)allmasks;
7409 7415
7410 7416
@@ -7658,13 +7664,14 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7658 cpu_attach_domain(sd, rd, i); 7664 cpu_attach_domain(sd, rd, i);
7659 } 7665 }
7660 7666
7661 SCHED_CPUMASK_FREE((void *)allmasks); 7667 sched_cpumask_free(allmasks);
7662 return 0; 7668 return 0;
7663 7669
7664#ifdef CONFIG_NUMA 7670#ifdef CONFIG_NUMA
7665error: 7671error:
7666 free_sched_groups(cpu_map, tmpmask); 7672 free_sched_groups(cpu_map, tmpmask);
7667 SCHED_CPUMASK_FREE((void *)allmasks); 7673 sched_cpumask_free(allmasks);
7674 kfree(rd);
7668 return -ENOMEM; 7675 return -ENOMEM;
7669#endif 7676#endif
7670} 7677}
@@ -7686,8 +7693,14 @@ static struct sched_domain_attr *dattr_cur;
7686 */ 7693 */
7687static cpumask_t fallback_doms; 7694static cpumask_t fallback_doms;
7688 7695
7689void __attribute__((weak)) arch_update_cpu_topology(void) 7696/*
7697 * arch_update_cpu_topology lets virtualized architectures update the
7698 * cpu core maps. It is supposed to return 1 if the topology changed
7699 * or 0 if it stayed the same.
7700 */
7701int __attribute__((weak)) arch_update_cpu_topology(void)
7690{ 7702{
7703 return 0;
7691} 7704}
7692 7705
7693/* 7706/*
@@ -7727,8 +7740,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7727 cpumask_t tmpmask; 7740 cpumask_t tmpmask;
7728 int i; 7741 int i;
7729 7742
7730 unregister_sched_domain_sysctl();
7731
7732 for_each_cpu_mask_nr(i, *cpu_map) 7743 for_each_cpu_mask_nr(i, *cpu_map)
7733 cpu_attach_domain(NULL, &def_root_domain, i); 7744 cpu_attach_domain(NULL, &def_root_domain, i);
7734 synchronize_sched(); 7745 synchronize_sched();
@@ -7766,13 +7777,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7766 * 7777 *
7767 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7778 * The passed in 'doms_new' should be kmalloc'd. This routine takes
7768 * ownership of it and will kfree it when done with it. If the caller 7779 * ownership of it and will kfree it when done with it. If the caller
7769 * failed the kmalloc call, then it can pass in doms_new == NULL, 7780 * failed the kmalloc call, then it can pass in doms_new == NULL &&
7770 * and partition_sched_domains() will fallback to the single partition 7781 * ndoms_new == 1, and partition_sched_domains() will fallback to
7771 * 'fallback_doms', it also forces the domains to be rebuilt. 7782 * the single partition 'fallback_doms', it also forces the domains
7783 * to be rebuilt.
7772 * 7784 *
7773 * If doms_new==NULL it will be replaced with cpu_online_map. 7785 * If doms_new == NULL it will be replaced with cpu_online_map.
7774 * ndoms_new==0 is a special case for destroying existing domains. 7786 * ndoms_new == 0 is a special case for destroying existing domains,
7775 * It will not create the default domain. 7787 * and it will not create the default domain.
7776 * 7788 *
7777 * Call with hotplug lock held 7789 * Call with hotplug lock held
7778 */ 7790 */
@@ -7780,17 +7792,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7780 struct sched_domain_attr *dattr_new) 7792 struct sched_domain_attr *dattr_new)
7781{ 7793{
7782 int i, j, n; 7794 int i, j, n;
7795 int new_topology;
7783 7796
7784 mutex_lock(&sched_domains_mutex); 7797 mutex_lock(&sched_domains_mutex);
7785 7798
7786 /* always unregister in case we don't destroy any domains */ 7799 /* always unregister in case we don't destroy any domains */
7787 unregister_sched_domain_sysctl(); 7800 unregister_sched_domain_sysctl();
7788 7801
7802 /* Let architecture update cpu core mappings. */
7803 new_topology = arch_update_cpu_topology();
7804
7789 n = doms_new ? ndoms_new : 0; 7805 n = doms_new ? ndoms_new : 0;
7790 7806
7791 /* Destroy deleted domains */ 7807 /* Destroy deleted domains */
7792 for (i = 0; i < ndoms_cur; i++) { 7808 for (i = 0; i < ndoms_cur; i++) {
7793 for (j = 0; j < n; j++) { 7809 for (j = 0; j < n && !new_topology; j++) {
7794 if (cpus_equal(doms_cur[i], doms_new[j]) 7810 if (cpus_equal(doms_cur[i], doms_new[j])
7795 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7811 && dattrs_equal(dattr_cur, i, dattr_new, j))
7796 goto match1; 7812 goto match1;
@@ -7805,12 +7821,12 @@ match1:
7805 ndoms_cur = 0; 7821 ndoms_cur = 0;
7806 doms_new = &fallback_doms; 7822 doms_new = &fallback_doms;
7807 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7823 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7808 dattr_new = NULL; 7824 WARN_ON_ONCE(dattr_new);
7809 } 7825 }
7810 7826
7811 /* Build new domains */ 7827 /* Build new domains */
7812 for (i = 0; i < ndoms_new; i++) { 7828 for (i = 0; i < ndoms_new; i++) {
7813 for (j = 0; j < ndoms_cur; j++) { 7829 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7814 if (cpus_equal(doms_new[i], doms_cur[j]) 7830 if (cpus_equal(doms_new[i], doms_cur[j])
7815 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7831 && dattrs_equal(dattr_new, i, dattr_cur, j))
7816 goto match2; 7832 goto match2;
@@ -8465,7 +8481,7 @@ static
8465int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8481int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8466{ 8482{
8467 struct cfs_rq *cfs_rq; 8483 struct cfs_rq *cfs_rq;
8468 struct sched_entity *se, *parent_se; 8484 struct sched_entity *se;
8469 struct rq *rq; 8485 struct rq *rq;
8470 int i; 8486 int i;
8471 8487
@@ -8481,18 +8497,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8481 for_each_possible_cpu(i) { 8497 for_each_possible_cpu(i) {
8482 rq = cpu_rq(i); 8498 rq = cpu_rq(i);
8483 8499
8484 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8500 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8485 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8501 GFP_KERNEL, cpu_to_node(i));
8486 if (!cfs_rq) 8502 if (!cfs_rq)
8487 goto err; 8503 goto err;
8488 8504
8489 se = kmalloc_node(sizeof(struct sched_entity), 8505 se = kzalloc_node(sizeof(struct sched_entity),
8490 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8506 GFP_KERNEL, cpu_to_node(i));
8491 if (!se) 8507 if (!se)
8492 goto err; 8508 goto err;
8493 8509
8494 parent_se = parent ? parent->se[i] : NULL; 8510 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8495 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8496 } 8511 }
8497 8512
8498 return 1; 8513 return 1;
@@ -8553,7 +8568,7 @@ static
8553int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8568int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8554{ 8569{
8555 struct rt_rq *rt_rq; 8570 struct rt_rq *rt_rq;
8556 struct sched_rt_entity *rt_se, *parent_se; 8571 struct sched_rt_entity *rt_se;
8557 struct rq *rq; 8572 struct rq *rq;
8558 int i; 8573 int i;
8559 8574
@@ -8570,18 +8585,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8570 for_each_possible_cpu(i) { 8585 for_each_possible_cpu(i) {
8571 rq = cpu_rq(i); 8586 rq = cpu_rq(i);
8572 8587
8573 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8588 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8574 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8589 GFP_KERNEL, cpu_to_node(i));
8575 if (!rt_rq) 8590 if (!rt_rq)
8576 goto err; 8591 goto err;
8577 8592
8578 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8593 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8579 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8594 GFP_KERNEL, cpu_to_node(i));
8580 if (!rt_se) 8595 if (!rt_se)
8581 goto err; 8596 goto err;
8582 8597
8583 parent_se = parent ? parent->rt_se[i] : NULL; 8598 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8584 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8585 } 8599 }
8586 8600
8587 return 1; 8601 return 1;
@@ -9224,11 +9238,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9224 * (balbir@in.ibm.com). 9238 * (balbir@in.ibm.com).
9225 */ 9239 */
9226 9240
9227/* track cpu usage of a group of tasks */ 9241/* track cpu usage of a group of tasks and its child groups */
9228struct cpuacct { 9242struct cpuacct {
9229 struct cgroup_subsys_state css; 9243 struct cgroup_subsys_state css;
9230 /* cpuusage holds pointer to a u64-type object on every cpu */ 9244 /* cpuusage holds pointer to a u64-type object on every cpu */
9231 u64 *cpuusage; 9245 u64 *cpuusage;
9246 struct cpuacct *parent;
9232}; 9247};
9233 9248
9234struct cgroup_subsys cpuacct_subsys; 9249struct cgroup_subsys cpuacct_subsys;
@@ -9262,6 +9277,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9262 return ERR_PTR(-ENOMEM); 9277 return ERR_PTR(-ENOMEM);
9263 } 9278 }
9264 9279
9280 if (cgrp->parent)
9281 ca->parent = cgroup_ca(cgrp->parent);
9282
9265 return &ca->css; 9283 return &ca->css;
9266} 9284}
9267 9285
@@ -9275,6 +9293,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9275 kfree(ca); 9293 kfree(ca);
9276} 9294}
9277 9295
9296static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9297{
9298 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9299 u64 data;
9300
9301#ifndef CONFIG_64BIT
9302 /*
9303 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9304 */
9305 spin_lock_irq(&cpu_rq(cpu)->lock);
9306 data = *cpuusage;
9307 spin_unlock_irq(&cpu_rq(cpu)->lock);
9308#else
9309 data = *cpuusage;
9310#endif
9311
9312 return data;
9313}
9314
9315static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9316{
9317 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9318
9319#ifndef CONFIG_64BIT
9320 /*
9321 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9322 */
9323 spin_lock_irq(&cpu_rq(cpu)->lock);
9324 *cpuusage = val;
9325 spin_unlock_irq(&cpu_rq(cpu)->lock);
9326#else
9327 *cpuusage = val;
9328#endif
9329}
9330
9278/* return total cpu usage (in nanoseconds) of a group */ 9331/* return total cpu usage (in nanoseconds) of a group */
9279static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9332static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9280{ 9333{
@@ -9282,17 +9335,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9282 u64 totalcpuusage = 0; 9335 u64 totalcpuusage = 0;
9283 int i; 9336 int i;
9284 9337
9285 for_each_possible_cpu(i) { 9338 for_each_present_cpu(i)
9286 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9339 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9287
9288 /*
9289 * Take rq->lock to make 64-bit addition safe on 32-bit
9290 * platforms.
9291 */
9292 spin_lock_irq(&cpu_rq(i)->lock);
9293 totalcpuusage += *cpuusage;
9294 spin_unlock_irq(&cpu_rq(i)->lock);
9295 }
9296 9340
9297 return totalcpuusage; 9341 return totalcpuusage;
9298} 9342}
@@ -9309,23 +9353,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9309 goto out; 9353 goto out;
9310 } 9354 }
9311 9355
9312 for_each_possible_cpu(i) { 9356 for_each_present_cpu(i)
9313 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9357 cpuacct_cpuusage_write(ca, i, 0);
9314 9358
9315 spin_lock_irq(&cpu_rq(i)->lock);
9316 *cpuusage = 0;
9317 spin_unlock_irq(&cpu_rq(i)->lock);
9318 }
9319out: 9359out:
9320 return err; 9360 return err;
9321} 9361}
9322 9362
9363static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9364 struct seq_file *m)
9365{
9366 struct cpuacct *ca = cgroup_ca(cgroup);
9367 u64 percpu;
9368 int i;
9369
9370 for_each_present_cpu(i) {
9371 percpu = cpuacct_cpuusage_read(ca, i);
9372 seq_printf(m, "%llu ", (unsigned long long) percpu);
9373 }
9374 seq_printf(m, "\n");
9375 return 0;
9376}
9377
9323static struct cftype files[] = { 9378static struct cftype files[] = {
9324 { 9379 {
9325 .name = "usage", 9380 .name = "usage",
9326 .read_u64 = cpuusage_read, 9381 .read_u64 = cpuusage_read,
9327 .write_u64 = cpuusage_write, 9382 .write_u64 = cpuusage_write,
9328 }, 9383 },
9384 {
9385 .name = "usage_percpu",
9386 .read_seq_string = cpuacct_percpu_seq_read,
9387 },
9388
9329}; 9389};
9330 9390
9331static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9391static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9341,14 +9401,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9341static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9401static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9342{ 9402{
9343 struct cpuacct *ca; 9403 struct cpuacct *ca;
9404 int cpu;
9344 9405
9345 if (!cpuacct_subsys.active) 9406 if (!cpuacct_subsys.active)
9346 return; 9407 return;
9347 9408
9409 cpu = task_cpu(tsk);
9348 ca = task_ca(tsk); 9410 ca = task_ca(tsk);
9349 if (ca) {
9350 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9351 9411
9412 for (; ca; ca = ca->parent) {
9413 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9352 *cpuusage += cputime; 9414 *cpuusage += cputime;
9353 } 9415 }
9354} 9416}