author     Ingo Molnar <mingo@elte.hu>    2008-12-31 02:31:57 -0500
committer  Ingo Molnar <mingo@elte.hu>    2008-12-31 02:31:57 -0500
commit     a9de18eb761f7c1c860964b2e5addc1a35c7e861 (patch)
tree       886e75fdfd09690cd262ca69cb7f5d1d42b48602 /kernel/sched.c
parent     b2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (diff)
parent     6a94cb73064c952255336cc57731904174b2c58f (diff)
Merge branch 'linus' into stackprotector
Conflicts:
arch/x86/include/asm/pda.h
kernel/fork.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  540
1 file changed, 301 insertions(+), 239 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d897a524e7d8..c731dd820d1a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/cpuset.h> | 55 | #include <linux/cpuset.h> |
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kthread.h> | 57 | #include <linux/kthread.h> |
58 | #include <linux/proc_fs.h> | ||
58 | #include <linux/seq_file.h> | 59 | #include <linux/seq_file.h> |
59 | #include <linux/sysctl.h> | 60 | #include <linux/sysctl.h> |
60 | #include <linux/syscalls.h> | 61 | #include <linux/syscalls.h> |
@@ -71,6 +72,7 @@ | |||
71 | #include <linux/debugfs.h> | 72 | #include <linux/debugfs.h> |
72 | #include <linux/ctype.h> | 73 | #include <linux/ctype.h> |
73 | #include <linux/ftrace.h> | 74 | #include <linux/ftrace.h> |
75 | #include <trace/sched.h> | ||
74 | 76 | ||
75 | #include <asm/tlb.h> | 77 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 78 | #include <asm/irq_regs.h> |
@@ -116,6 +118,12 @@ | |||
116 | */ | 118 | */ |
117 | #define RUNTIME_INF ((u64)~0ULL) | 119 | #define RUNTIME_INF ((u64)~0ULL) |
118 | 120 | ||
121 | DEFINE_TRACE(sched_wait_task); | ||
122 | DEFINE_TRACE(sched_wakeup); | ||
123 | DEFINE_TRACE(sched_wakeup_new); | ||
124 | DEFINE_TRACE(sched_switch); | ||
125 | DEFINE_TRACE(sched_migrate_task); | ||
126 | |||
119 | #ifdef CONFIG_SMP | 127 | #ifdef CONFIG_SMP |
120 | /* | 128 | /* |
121 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | 129 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) |
@@ -201,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
201 | hrtimer_init(&rt_b->rt_period_timer, | 209 | hrtimer_init(&rt_b->rt_period_timer, |
202 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 210 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
203 | rt_b->rt_period_timer.function = sched_rt_period_timer; | 211 | rt_b->rt_period_timer.function = sched_rt_period_timer; |
204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; | ||
205 | } | 212 | } |
206 | 213 | ||
207 | static inline int rt_bandwidth_enabled(void) | 214 | static inline int rt_bandwidth_enabled(void) |
@@ -226,9 +233,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
226 | 233 | ||
227 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | 234 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); |
228 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | 235 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); |
229 | hrtimer_start(&rt_b->rt_period_timer, | 236 | hrtimer_start_expires(&rt_b->rt_period_timer, |
230 | rt_b->rt_period_timer.expires, | 237 | HRTIMER_MODE_ABS); |
231 | HRTIMER_MODE_ABS); | ||
232 | } | 238 | } |
233 | spin_unlock(&rt_b->rt_runtime_lock); | 239 | spin_unlock(&rt_b->rt_runtime_lock); |
234 | } | 240 | } |
@@ -260,6 +266,10 @@ struct task_group { | |||
260 | struct cgroup_subsys_state css; | 266 | struct cgroup_subsys_state css; |
261 | #endif | 267 | #endif |
262 | 268 | ||
269 | #ifdef CONFIG_USER_SCHED | ||
270 | uid_t uid; | ||
271 | #endif | ||
272 | |||
263 | #ifdef CONFIG_FAIR_GROUP_SCHED | 273 | #ifdef CONFIG_FAIR_GROUP_SCHED |
264 | /* schedulable entities of this group on each cpu */ | 274 | /* schedulable entities of this group on each cpu */ |
265 | struct sched_entity **se; | 275 | struct sched_entity **se; |
@@ -285,6 +295,12 @@ struct task_group { | |||
285 | 295 | ||
286 | #ifdef CONFIG_USER_SCHED | 296 | #ifdef CONFIG_USER_SCHED |
287 | 297 | ||
298 | /* Helper function to pass uid information to create_sched_user() */ | ||
299 | void set_tg_uid(struct user_struct *user) | ||
300 | { | ||
301 | user->tg->uid = user->uid; | ||
302 | } | ||
303 | |||
288 | /* | 304 | /* |
289 | * Root task group. | 305 | * Root task group. |
290 | * Every UID task group (including init_task_group aka UID-0) will | 306 | * Every UID task group (including init_task_group aka UID-0) will |
@@ -344,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
344 | struct task_group *tg; | 360 | struct task_group *tg; |
345 | 361 | ||
346 | #ifdef CONFIG_USER_SCHED | 362 | #ifdef CONFIG_USER_SCHED |
347 | tg = p->user->tg; | 363 | rcu_read_lock(); |
364 | tg = __task_cred(p)->user->tg; | ||
365 | rcu_read_unlock(); | ||
348 | #elif defined(CONFIG_CGROUP_SCHED) | 366 | #elif defined(CONFIG_CGROUP_SCHED) |
349 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 367 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
350 | struct task_group, css); | 368 | struct task_group, css); |
@@ -385,7 +403,6 @@ struct cfs_rq { | |||
385 | 403 | ||
386 | u64 exec_clock; | 404 | u64 exec_clock; |
387 | u64 min_vruntime; | 405 | u64 min_vruntime; |
388 | u64 pair_start; | ||
389 | 406 | ||
390 | struct rb_root tasks_timeline; | 407 | struct rb_root tasks_timeline; |
391 | struct rb_node *rb_leftmost; | 408 | struct rb_node *rb_leftmost; |
@@ -397,9 +414,9 @@ struct cfs_rq { | |||
397 | * 'curr' points to currently running entity on this cfs_rq. | 414 | * 'curr' points to currently running entity on this cfs_rq. |
398 | * It is set to NULL otherwise (i.e when none are currently running). | 415 | * It is set to NULL otherwise (i.e when none are currently running). |
399 | */ | 416 | */ |
400 | struct sched_entity *curr, *next; | 417 | struct sched_entity *curr, *next, *last; |
401 | 418 | ||
402 | unsigned long nr_spread_over; | 419 | unsigned int nr_spread_over; |
403 | 420 | ||
404 | #ifdef CONFIG_FAIR_GROUP_SCHED | 421 | #ifdef CONFIG_FAIR_GROUP_SCHED |
405 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 422 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -586,6 +603,8 @@ struct rq { | |||
586 | #ifdef CONFIG_SCHEDSTATS | 603 | #ifdef CONFIG_SCHEDSTATS |
587 | /* latency stats */ | 604 | /* latency stats */ |
588 | struct sched_info rq_sched_info; | 605 | struct sched_info rq_sched_info; |
606 | unsigned long long rq_cpu_time; | ||
607 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
589 | 608 | ||
590 | /* sys_sched_yield() stats */ | 609 | /* sys_sched_yield() stats */ |
591 | unsigned int yld_exp_empty; | 610 | unsigned int yld_exp_empty; |
@@ -703,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = { | |||
703 | 722 | ||
704 | #undef SCHED_FEAT | 723 | #undef SCHED_FEAT |
705 | 724 | ||
706 | static int sched_feat_open(struct inode *inode, struct file *filp) | 725 | static int sched_feat_show(struct seq_file *m, void *v) |
707 | { | 726 | { |
708 | filp->private_data = inode->i_private; | ||
709 | return 0; | ||
710 | } | ||
711 | |||
712 | static ssize_t | ||
713 | sched_feat_read(struct file *filp, char __user *ubuf, | ||
714 | size_t cnt, loff_t *ppos) | ||
715 | { | ||
716 | char *buf; | ||
717 | int r = 0; | ||
718 | int len = 0; | ||
719 | int i; | 727 | int i; |
720 | 728 | ||
721 | for (i = 0; sched_feat_names[i]; i++) { | 729 | for (i = 0; sched_feat_names[i]; i++) { |
722 | len += strlen(sched_feat_names[i]); | 730 | if (!(sysctl_sched_features & (1UL << i))) |
723 | len += 4; | 731 | seq_puts(m, "NO_"); |
732 | seq_printf(m, "%s ", sched_feat_names[i]); | ||
724 | } | 733 | } |
734 | seq_puts(m, "\n"); | ||
725 | 735 | ||
726 | buf = kmalloc(len + 2, GFP_KERNEL); | 736 | return 0; |
727 | if (!buf) | ||
728 | return -ENOMEM; | ||
729 | |||
730 | for (i = 0; sched_feat_names[i]; i++) { | ||
731 | if (sysctl_sched_features & (1UL << i)) | ||
732 | r += sprintf(buf + r, "%s ", sched_feat_names[i]); | ||
733 | else | ||
734 | r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); | ||
735 | } | ||
736 | |||
737 | r += sprintf(buf + r, "\n"); | ||
738 | WARN_ON(r >= len + 2); | ||
739 | |||
740 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
741 | |||
742 | kfree(buf); | ||
743 | |||
744 | return r; | ||
745 | } | 737 | } |
746 | 738 | ||
747 | static ssize_t | 739 | static ssize_t |
@@ -786,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
786 | return cnt; | 778 | return cnt; |
787 | } | 779 | } |
788 | 780 | ||
781 | static int sched_feat_open(struct inode *inode, struct file *filp) | ||
782 | { | ||
783 | return single_open(filp, sched_feat_show, NULL); | ||
784 | } | ||
785 | |||
789 | static struct file_operations sched_feat_fops = { | 786 | static struct file_operations sched_feat_fops = { |
790 | .open = sched_feat_open, | 787 | .open = sched_feat_open, |
791 | .read = sched_feat_read, | 788 | .write = sched_feat_write, |
792 | .write = sched_feat_write, | 789 | .read = seq_read, |
790 | .llseek = seq_lseek, | ||
791 | .release = single_release, | ||
793 | }; | 792 | }; |
794 | 793 | ||
795 | static __init int sched_init_debug(void) | 794 | static __init int sched_init_debug(void) |
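Note: the hunk above drops the hand-rolled kmalloc/sprintf read path for the sched_features debugfs file in favour of the seq_file helpers: the show callback only emits text, and single_open()/seq_read()/single_release() do the buffering. For reference, a minimal kernel-style sketch of that pattern (the "demo" names are invented, this is not code from the patch):

```c
#include <linux/fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello\n");	/* seq_file grows the buffer as needed */
	return 0;
}

static int demo_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
```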
@@ -818,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
818 | unsigned int sysctl_sched_shares_ratelimit = 250000; | 817 | unsigned int sysctl_sched_shares_ratelimit = 250000; |
819 | 818 | ||
820 | /* | 819 | /* |
820 | * Inject some fuzzyness into changing the per-cpu group shares | ||
821 | * this avoids remote rq-locks at the expense of fairness. | ||
822 | * default: 4 | ||
823 | */ | ||
824 | unsigned int sysctl_sched_shares_thresh = 4; | ||
825 | |||
826 | /* | ||
821 | * period over which we measure -rt task cpu usage in us. | 827 | * period over which we measure -rt task cpu usage in us. |
822 | * default: 1s | 828 | * default: 1s |
823 | */ | 829 | */ |
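Note: the new sysctl_sched_shares_thresh works together with update_group_shares_cpu() further down in this diff: a recomputed per-cpu share is only written back (under the remote rq->lock) when it differs from the current se->load.weight by more than the threshold, trading a little fairness for fewer cross-cpu lock acquisitions. A minimal userspace sketch of that gate, with invented names:

```c
/* Write back a recomputed share only when it moved by more than 'thresh';
 * small deltas are ignored so the (remote, possibly contended) lock is not
 * taken for noise.  Mirrors the check added to update_group_shares_cpu(). */
static void maybe_update_share(unsigned long *cur_weight,
			       unsigned long new_share,
			       unsigned long thresh)
{
	unsigned long delta = (new_share > *cur_weight) ?
			      new_share - *cur_weight :
			      *cur_weight - new_share;

	if (delta > thresh)
		*cur_weight = new_share;	/* in sched.c: done under rq->lock */
}
```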
@@ -962,6 +968,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
962 | } | 968 | } |
963 | } | 969 | } |
964 | 970 | ||
971 | void task_rq_unlock_wait(struct task_struct *p) | ||
972 | { | ||
973 | struct rq *rq = task_rq(p); | ||
974 | |||
975 | smp_mb(); /* spin-unlock-wait is not a full memory barrier */ | ||
976 | spin_unlock_wait(&rq->lock); | ||
977 | } | ||
978 | |||
965 | static void __task_rq_unlock(struct rq *rq) | 979 | static void __task_rq_unlock(struct rq *rq) |
966 | __releases(rq->lock) | 980 | __releases(rq->lock) |
967 | { | 981 | { |
@@ -1063,7 +1077,7 @@ static void hrtick_start(struct rq *rq, u64 delay) | |||
1063 | struct hrtimer *timer = &rq->hrtick_timer; | 1077 | struct hrtimer *timer = &rq->hrtick_timer; |
1064 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 1078 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
1065 | 1079 | ||
1066 | timer->expires = time; | 1080 | hrtimer_set_expires(timer, time); |
1067 | 1081 | ||
1068 | if (rq == this_rq()) { | 1082 | if (rq == this_rq()) { |
1069 | hrtimer_restart(timer); | 1083 | hrtimer_restart(timer); |
@@ -1124,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq) | |||
1124 | 1138 | ||
1125 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1139 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1126 | rq->hrtick_timer.function = hrtick; | 1140 | rq->hrtick_timer.function = hrtick; |
1127 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; | ||
1128 | } | 1141 | } |
1129 | #else /* CONFIG_SCHED_HRTICK */ | 1142 | #else /* CONFIG_SCHED_HRTICK */ |
1130 | static inline void hrtick_clear(struct rq *rq) | 1143 | static inline void hrtick_clear(struct rq *rq) |
@@ -1438,9 +1451,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | |||
1438 | static unsigned long cpu_avg_load_per_task(int cpu) | 1451 | static unsigned long cpu_avg_load_per_task(int cpu) |
1439 | { | 1452 | { |
1440 | struct rq *rq = cpu_rq(cpu); | 1453 | struct rq *rq = cpu_rq(cpu); |
1454 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
1441 | 1455 | ||
1442 | if (rq->nr_running) | 1456 | if (nr_running) |
1443 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | 1457 | rq->avg_load_per_task = rq->load.weight / nr_running; |
1458 | else | ||
1459 | rq->avg_load_per_task = 0; | ||
1444 | 1460 | ||
1445 | return rq->avg_load_per_task; | 1461 | return rq->avg_load_per_task; |
1446 | } | 1462 | } |
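Note: the change above reads rq->nr_running exactly once; without the snapshot, nr_running could drop to zero between the check and the division, giving a divide-by-zero or a stale average. A compilable userspace sketch of the same snapshot-then-divide pattern (names are illustrative):

```c
/* Read the shared counter once through a volatile cast (a userspace
 * analogue of ACCESS_ONCE) so the test and the division see the same
 * value even if another thread changes it concurrently. */
static unsigned long avg_load_per_task(unsigned long load_weight,
				       const unsigned long *nr_running)
{
	unsigned long nr = *(volatile const unsigned long *)nr_running;

	return nr ? load_weight / nr : 0;
}
```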
@@ -1453,30 +1469,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); | |||
1453 | * Calculate and set the cpu's group shares. | 1469 | * Calculate and set the cpu's group shares. |
1454 | */ | 1470 | */ |
1455 | static void | 1471 | static void |
1456 | __update_group_shares_cpu(struct task_group *tg, int cpu, | 1472 | update_group_shares_cpu(struct task_group *tg, int cpu, |
1457 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1473 | unsigned long sd_shares, unsigned long sd_rq_weight) |
1458 | { | 1474 | { |
1459 | int boost = 0; | ||
1460 | unsigned long shares; | 1475 | unsigned long shares; |
1461 | unsigned long rq_weight; | 1476 | unsigned long rq_weight; |
1462 | 1477 | ||
1463 | if (!tg->se[cpu]) | 1478 | if (!tg->se[cpu]) |
1464 | return; | 1479 | return; |
1465 | 1480 | ||
1466 | rq_weight = tg->cfs_rq[cpu]->load.weight; | 1481 | rq_weight = tg->cfs_rq[cpu]->rq_weight; |
1467 | |||
1468 | /* | ||
1469 | * If there are currently no tasks on the cpu pretend there is one of | ||
1470 | * average load so that when a new task gets to run here it will not | ||
1471 | * get delayed by group starvation. | ||
1472 | */ | ||
1473 | if (!rq_weight) { | ||
1474 | boost = 1; | ||
1475 | rq_weight = NICE_0_LOAD; | ||
1476 | } | ||
1477 | |||
1478 | if (unlikely(rq_weight > sd_rq_weight)) | ||
1479 | rq_weight = sd_rq_weight; | ||
1480 | 1482 | ||
1481 | /* | 1483 | /* |
1482 | * \Sum shares * rq_weight | 1484 | * \Sum shares * rq_weight |
@@ -1484,20 +1486,20 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1484 | * \Sum rq_weight | 1486 | * \Sum rq_weight |
1485 | * | 1487 | * |
1486 | */ | 1488 | */ |
1487 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); | 1489 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1490 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1488 | 1491 | ||
1489 | /* | 1492 | if (abs(shares - tg->se[cpu]->load.weight) > |
1490 | * record the actual number of shares, not the boosted amount. | 1493 | sysctl_sched_shares_thresh) { |
1491 | */ | 1494 | struct rq *rq = cpu_rq(cpu); |
1492 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | 1495 | unsigned long flags; |
1493 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
1494 | 1496 | ||
1495 | if (shares < MIN_SHARES) | 1497 | spin_lock_irqsave(&rq->lock, flags); |
1496 | shares = MIN_SHARES; | 1498 | tg->cfs_rq[cpu]->shares = shares; |
1497 | else if (shares > MAX_SHARES) | ||
1498 | shares = MAX_SHARES; | ||
1499 | 1499 | ||
1500 | __set_se_shares(tg->se[cpu], shares); | 1500 | __set_se_shares(tg->se[cpu], shares); |
1501 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1502 | } | ||
1501 | } | 1503 | } |
1502 | 1504 | ||
1503 | /* | 1505 | /* |
@@ -1507,13 +1509,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1507 | */ | 1509 | */ |
1508 | static int tg_shares_up(struct task_group *tg, void *data) | 1510 | static int tg_shares_up(struct task_group *tg, void *data) |
1509 | { | 1511 | { |
1510 | unsigned long rq_weight = 0; | 1512 | unsigned long weight, rq_weight = 0; |
1511 | unsigned long shares = 0; | 1513 | unsigned long shares = 0; |
1512 | struct sched_domain *sd = data; | 1514 | struct sched_domain *sd = data; |
1513 | int i; | 1515 | int i; |
1514 | 1516 | ||
1515 | for_each_cpu_mask(i, sd->span) { | 1517 | for_each_cpu_mask(i, sd->span) { |
1516 | rq_weight += tg->cfs_rq[i]->load.weight; | 1518 | /* |
1519 | * If there are currently no tasks on the cpu pretend there | ||
1520 | * is one of average load so that when a new task gets to | ||
1521 | * run here it will not get delayed by group starvation. | ||
1522 | */ | ||
1523 | weight = tg->cfs_rq[i]->load.weight; | ||
1524 | if (!weight) | ||
1525 | weight = NICE_0_LOAD; | ||
1526 | |||
1527 | tg->cfs_rq[i]->rq_weight = weight; | ||
1528 | rq_weight += weight; | ||
1517 | shares += tg->cfs_rq[i]->shares; | 1529 | shares += tg->cfs_rq[i]->shares; |
1518 | } | 1530 | } |
1519 | 1531 | ||
@@ -1523,17 +1535,8 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1523 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | 1535 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) |
1524 | shares = tg->shares; | 1536 | shares = tg->shares; |
1525 | 1537 | ||
1526 | if (!rq_weight) | 1538 | for_each_cpu_mask(i, sd->span) |
1527 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; | 1539 | update_group_shares_cpu(tg, i, shares, rq_weight); |
1528 | |||
1529 | for_each_cpu_mask(i, sd->span) { | ||
1530 | struct rq *rq = cpu_rq(i); | ||
1531 | unsigned long flags; | ||
1532 | |||
1533 | spin_lock_irqsave(&rq->lock, flags); | ||
1534 | __update_group_shares_cpu(tg, i, shares, rq_weight); | ||
1535 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1536 | } | ||
1537 | 1540 | ||
1538 | return 0; | 1541 | return 0; |
1539 | } | 1542 | } |
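Worked example of the redistribution above (numbers invented for illustration): with tg->shares = 1024 and two CPUs whose cfs_rq weights are 2048 and 0, the idle CPU is treated as NICE_0_LOAD (1024 on a standard build), so the summed rq_weight is 3072; the loaded CPU then gets 1024 * 2048 / 3072 = 682 shares and the idle one 1024 * 1024 / 3072 = 341, each clamped to [MIN_SHARES, MAX_SHARES] by update_group_shares_cpu().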
@@ -1596,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1596 | 1599 | ||
1597 | #endif | 1600 | #endif |
1598 | 1601 | ||
1602 | /* | ||
1603 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1604 | */ | ||
1605 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1606 | __releases(this_rq->lock) | ||
1607 | __acquires(busiest->lock) | ||
1608 | __acquires(this_rq->lock) | ||
1609 | { | ||
1610 | int ret = 0; | ||
1611 | |||
1612 | if (unlikely(!irqs_disabled())) { | ||
1613 | /* printk() doesn't work good under rq->lock */ | ||
1614 | spin_unlock(&this_rq->lock); | ||
1615 | BUG_ON(1); | ||
1616 | } | ||
1617 | if (unlikely(!spin_trylock(&busiest->lock))) { | ||
1618 | if (busiest < this_rq) { | ||
1619 | spin_unlock(&this_rq->lock); | ||
1620 | spin_lock(&busiest->lock); | ||
1621 | spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); | ||
1622 | ret = 1; | ||
1623 | } else | ||
1624 | spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); | ||
1625 | } | ||
1626 | return ret; | ||
1627 | } | ||
1628 | |||
1629 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
1630 | __releases(busiest->lock) | ||
1631 | { | ||
1632 | spin_unlock(&busiest->lock); | ||
1633 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
1634 | } | ||
1599 | #endif | 1635 | #endif |
1600 | 1636 | ||
1601 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1637 | #ifdef CONFIG_FAIR_GROUP_SCHED |
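Note: double_lock_balance() is moved up here (and removed from its old location later in this diff) so the group-scheduling code can use it; the deadlock-avoidance scheme itself is unchanged: try the second lock, and on contention fall back to a fixed address order, dropping and re-taking the first lock when it is the "higher" one. A compilable pthread sketch of the same scheme (illustrative names, not kernel code):

```c
#include <pthread.h>

/* Returns 1 if 'this_lock' was dropped and re-acquired, in which case the
 * caller must revalidate any state it read under it (as the scheduler does). */
static int lock_second_rq(pthread_mutex_t *this_lock, pthread_mutex_t *busiest)
{
	int dropped = 0;

	if (pthread_mutex_trylock(busiest) != 0) {
		if (busiest < this_lock) {	/* fixed order: lower address first */
			pthread_mutex_unlock(this_lock);
			pthread_mutex_lock(busiest);
			pthread_mutex_lock(this_lock);
			dropped = 1;
		} else {
			pthread_mutex_lock(busiest);
		}
	}
	return dropped;
}
```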
@@ -1800,7 +1836,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
1800 | /* | 1836 | /* |
1801 | * Buddy candidates are cache hot: | 1837 | * Buddy candidates are cache hot: |
1802 | */ | 1838 | */ |
1803 | if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) | 1839 | if (sched_feat(CACHE_HOT_BUDDY) && |
1840 | (&p->se == cfs_rq_of(&p->se)->next || | ||
1841 | &p->se == cfs_rq_of(&p->se)->last)) | ||
1804 | return 1; | 1842 | return 1; |
1805 | 1843 | ||
1806 | if (p->sched_class != &fair_sched_class) | 1844 | if (p->sched_class != &fair_sched_class) |
@@ -1827,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1827 | 1865 | ||
1828 | clock_offset = old_rq->clock - new_rq->clock; | 1866 | clock_offset = old_rq->clock - new_rq->clock; |
1829 | 1867 | ||
1868 | trace_sched_migrate_task(p, task_cpu(p), new_cpu); | ||
1869 | |||
1830 | #ifdef CONFIG_SCHEDSTATS | 1870 | #ifdef CONFIG_SCHEDSTATS |
1831 | if (p->se.wait_start) | 1871 | if (p->se.wait_start) |
1832 | p->se.wait_start -= clock_offset; | 1872 | p->se.wait_start -= clock_offset; |
@@ -1936,6 +1976,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
1936 | * just go back and repeat. | 1976 | * just go back and repeat. |
1937 | */ | 1977 | */ |
1938 | rq = task_rq_lock(p, &flags); | 1978 | rq = task_rq_lock(p, &flags); |
1979 | trace_sched_wait_task(rq, p); | ||
1939 | running = task_running(rq, p); | 1980 | running = task_running(rq, p); |
1940 | on_rq = p->se.on_rq; | 1981 | on_rq = p->se.on_rq; |
1941 | ncsw = 0; | 1982 | ncsw = 0; |
@@ -2235,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2235 | 2276 | ||
2236 | smp_wmb(); | 2277 | smp_wmb(); |
2237 | rq = task_rq_lock(p, &flags); | 2278 | rq = task_rq_lock(p, &flags); |
2279 | update_rq_clock(rq); | ||
2238 | old_state = p->state; | 2280 | old_state = p->state; |
2239 | if (!(old_state & state)) | 2281 | if (!(old_state & state)) |
2240 | goto out; | 2282 | goto out; |
@@ -2292,14 +2334,11 @@ out_activate: | |||
2292 | schedstat_inc(p, se.nr_wakeups_local); | 2334 | schedstat_inc(p, se.nr_wakeups_local); |
2293 | else | 2335 | else |
2294 | schedstat_inc(p, se.nr_wakeups_remote); | 2336 | schedstat_inc(p, se.nr_wakeups_remote); |
2295 | update_rq_clock(rq); | ||
2296 | activate_task(rq, p, 1); | 2337 | activate_task(rq, p, 1); |
2297 | success = 1; | 2338 | success = 1; |
2298 | 2339 | ||
2299 | out_running: | 2340 | out_running: |
2300 | trace_mark(kernel_sched_wakeup, | 2341 | trace_sched_wakeup(rq, p, success); |
2301 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2302 | p->pid, p->state, rq, p, rq->curr); | ||
2303 | check_preempt_curr(rq, p, sync); | 2342 | check_preempt_curr(rq, p, sync); |
2304 | 2343 | ||
2305 | p->state = TASK_RUNNING; | 2344 | p->state = TASK_RUNNING; |
@@ -2432,9 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2432 | p->sched_class->task_new(rq, p); | 2471 | p->sched_class->task_new(rq, p); |
2433 | inc_nr_running(rq); | 2472 | inc_nr_running(rq); |
2434 | } | 2473 | } |
2435 | trace_mark(kernel_sched_wakeup_new, | 2474 | trace_sched_wakeup_new(rq, p, 1); |
2436 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2437 | p->pid, p->state, rq, p, rq->curr); | ||
2438 | check_preempt_curr(rq, p, 0); | 2475 | check_preempt_curr(rq, p, 0); |
2439 | #ifdef CONFIG_SMP | 2476 | #ifdef CONFIG_SMP |
2440 | if (p->sched_class->task_wake_up) | 2477 | if (p->sched_class->task_wake_up) |
@@ -2607,11 +2644,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2607 | struct mm_struct *mm, *oldmm; | 2644 | struct mm_struct *mm, *oldmm; |
2608 | 2645 | ||
2609 | prepare_task_switch(rq, prev, next); | 2646 | prepare_task_switch(rq, prev, next); |
2610 | trace_mark(kernel_sched_schedule, | 2647 | trace_sched_switch(rq, prev, next); |
2611 | "prev_pid %d next_pid %d prev_state %ld " | ||
2612 | "## rq %p prev %p next %p", | ||
2613 | prev->pid, next->pid, prev->state, | ||
2614 | rq, prev, next); | ||
2615 | mm = next->mm; | 2648 | mm = next->mm; |
2616 | oldmm = prev->active_mm; | 2649 | oldmm = prev->active_mm; |
2617 | /* | 2650 | /* |
@@ -2801,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
2801 | } | 2834 | } |
2802 | 2835 | ||
2803 | /* | 2836 | /* |
2804 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
2805 | */ | ||
2806 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
2807 | __releases(this_rq->lock) | ||
2808 | __acquires(busiest->lock) | ||
2809 | __acquires(this_rq->lock) | ||
2810 | { | ||
2811 | int ret = 0; | ||
2812 | |||
2813 | if (unlikely(!irqs_disabled())) { | ||
2814 | /* printk() doesn't work good under rq->lock */ | ||
2815 | spin_unlock(&this_rq->lock); | ||
2816 | BUG_ON(1); | ||
2817 | } | ||
2818 | if (unlikely(!spin_trylock(&busiest->lock))) { | ||
2819 | if (busiest < this_rq) { | ||
2820 | spin_unlock(&this_rq->lock); | ||
2821 | spin_lock(&busiest->lock); | ||
2822 | spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); | ||
2823 | ret = 1; | ||
2824 | } else | ||
2825 | spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); | ||
2826 | } | ||
2827 | return ret; | ||
2828 | } | ||
2829 | |||
2830 | static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
2831 | __releases(busiest->lock) | ||
2832 | { | ||
2833 | spin_unlock(&busiest->lock); | ||
2834 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
2835 | } | ||
2836 | |||
2837 | /* | ||
2838 | * If dest_cpu is allowed for this process, migrate the task to it. | 2837 | * If dest_cpu is allowed for this process, migrate the task to it. |
2839 | * This is accomplished by forcing the cpu_allowed mask to only | 2838 | * This is accomplished by forcing the cpu_allowed mask to only |
2840 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 2839 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
@@ -3344,7 +3343,7 @@ small_imbalance: | |||
3344 | } else | 3343 | } else |
3345 | this_load_per_task = cpu_avg_load_per_task(this_cpu); | 3344 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
3346 | 3345 | ||
3347 | if (max_load - this_load + 2*busiest_load_per_task >= | 3346 | if (max_load - this_load + busiest_load_per_task >= |
3348 | busiest_load_per_task * imbn) { | 3347 | busiest_load_per_task * imbn) { |
3349 | *imbalance = busiest_load_per_task; | 3348 | *imbalance = busiest_load_per_task; |
3350 | return busiest; | 3349 | return busiest; |
@@ -3695,7 +3694,7 @@ out_balanced: | |||
3695 | static void idle_balance(int this_cpu, struct rq *this_rq) | 3694 | static void idle_balance(int this_cpu, struct rq *this_rq) |
3696 | { | 3695 | { |
3697 | struct sched_domain *sd; | 3696 | struct sched_domain *sd; |
3698 | int pulled_task = -1; | 3697 | int pulled_task = 0; |
3699 | unsigned long next_balance = jiffies + HZ; | 3698 | unsigned long next_balance = jiffies + HZ; |
3700 | cpumask_t tmpmask; | 3699 | cpumask_t tmpmask; |
3701 | 3700 | ||
@@ -4052,23 +4051,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); | |||
4052 | EXPORT_PER_CPU_SYMBOL(kstat); | 4051 | EXPORT_PER_CPU_SYMBOL(kstat); |
4053 | 4052 | ||
4054 | /* | 4053 | /* |
4055 | * Return p->sum_exec_runtime plus any more ns on the sched_clock | 4054 | * Return any ns on the sched_clock that have not yet been banked in |
4056 | * that have not yet been banked in case the task is currently running. | 4055 | * @p in case that task is currently running. |
4057 | */ | 4056 | */ |
4058 | unsigned long long task_sched_runtime(struct task_struct *p) | 4057 | unsigned long long task_delta_exec(struct task_struct *p) |
4059 | { | 4058 | { |
4060 | unsigned long flags; | 4059 | unsigned long flags; |
4061 | u64 ns, delta_exec; | ||
4062 | struct rq *rq; | 4060 | struct rq *rq; |
4061 | u64 ns = 0; | ||
4063 | 4062 | ||
4064 | rq = task_rq_lock(p, &flags); | 4063 | rq = task_rq_lock(p, &flags); |
4065 | ns = p->se.sum_exec_runtime; | 4064 | |
4066 | if (task_current(rq, p)) { | 4065 | if (task_current(rq, p)) { |
4066 | u64 delta_exec; | ||
4067 | |||
4067 | update_rq_clock(rq); | 4068 | update_rq_clock(rq); |
4068 | delta_exec = rq->clock - p->se.exec_start; | 4069 | delta_exec = rq->clock - p->se.exec_start; |
4069 | if ((s64)delta_exec > 0) | 4070 | if ((s64)delta_exec > 0) |
4070 | ns += delta_exec; | 4071 | ns = delta_exec; |
4071 | } | 4072 | } |
4073 | |||
4072 | task_rq_unlock(rq, &flags); | 4074 | task_rq_unlock(rq, &flags); |
4073 | 4075 | ||
4074 | return ns; | 4076 | return ns; |
@@ -4085,6 +4087,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
4085 | cputime64_t tmp; | 4087 | cputime64_t tmp; |
4086 | 4088 | ||
4087 | p->utime = cputime_add(p->utime, cputime); | 4089 | p->utime = cputime_add(p->utime, cputime); |
4090 | account_group_user_time(p, cputime); | ||
4088 | 4091 | ||
4089 | /* Add user time to cpustat. */ | 4092 | /* Add user time to cpustat. */ |
4090 | tmp = cputime_to_cputime64(cputime); | 4093 | tmp = cputime_to_cputime64(cputime); |
@@ -4109,6 +4112,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) | |||
4109 | tmp = cputime_to_cputime64(cputime); | 4112 | tmp = cputime_to_cputime64(cputime); |
4110 | 4113 | ||
4111 | p->utime = cputime_add(p->utime, cputime); | 4114 | p->utime = cputime_add(p->utime, cputime); |
4115 | account_group_user_time(p, cputime); | ||
4112 | p->gtime = cputime_add(p->gtime, cputime); | 4116 | p->gtime = cputime_add(p->gtime, cputime); |
4113 | 4117 | ||
4114 | cpustat->user = cputime64_add(cpustat->user, tmp); | 4118 | cpustat->user = cputime64_add(cpustat->user, tmp); |
@@ -4144,6 +4148,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
4144 | } | 4148 | } |
4145 | 4149 | ||
4146 | p->stime = cputime_add(p->stime, cputime); | 4150 | p->stime = cputime_add(p->stime, cputime); |
4151 | account_group_system_time(p, cputime); | ||
4147 | 4152 | ||
4148 | /* Add system time to cpustat. */ | 4153 | /* Add system time to cpustat. */ |
4149 | tmp = cputime_to_cputime64(cputime); | 4154 | tmp = cputime_to_cputime64(cputime); |
@@ -4320,7 +4325,7 @@ void __kprobes sub_preempt_count(int val) | |||
4320 | /* | 4325 | /* |
4321 | * Underflow? | 4326 | * Underflow? |
4322 | */ | 4327 | */ |
4323 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) | 4328 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) |
4324 | return; | 4329 | return; |
4325 | /* | 4330 | /* |
4326 | * Is the spinlock portion underflowing? | 4331 | * Is the spinlock portion underflowing? |
@@ -4441,12 +4446,8 @@ need_resched_nonpreemptible: | |||
4441 | if (sched_feat(HRTICK)) | 4446 | if (sched_feat(HRTICK)) |
4442 | hrtick_clear(rq); | 4447 | hrtick_clear(rq); |
4443 | 4448 | ||
4444 | /* | 4449 | spin_lock_irq(&rq->lock); |
4445 | * Do the rq-clock update outside the rq lock: | ||
4446 | */ | ||
4447 | local_irq_disable(); | ||
4448 | update_rq_clock(rq); | 4450 | update_rq_clock(rq); |
4449 | spin_lock(&rq->lock); | ||
4450 | clear_tsk_need_resched(prev); | 4451 | clear_tsk_need_resched(prev); |
4451 | 4452 | ||
4452 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4453 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -5119,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
5119 | set_load_weight(p); | 5120 | set_load_weight(p); |
5120 | } | 5121 | } |
5121 | 5122 | ||
5123 | /* | ||
5124 | * check the target process has a UID that matches the current process's | ||
5125 | */ | ||
5126 | static bool check_same_owner(struct task_struct *p) | ||
5127 | { | ||
5128 | const struct cred *cred = current_cred(), *pcred; | ||
5129 | bool match; | ||
5130 | |||
5131 | rcu_read_lock(); | ||
5132 | pcred = __task_cred(p); | ||
5133 | match = (cred->euid == pcred->euid || | ||
5134 | cred->euid == pcred->uid); | ||
5135 | rcu_read_unlock(); | ||
5136 | return match; | ||
5137 | } | ||
5138 | |||
5122 | static int __sched_setscheduler(struct task_struct *p, int policy, | 5139 | static int __sched_setscheduler(struct task_struct *p, int policy, |
5123 | struct sched_param *param, bool user) | 5140 | struct sched_param *param, bool user) |
5124 | { | 5141 | { |
@@ -5178,8 +5195,7 @@ recheck: | |||
5178 | return -EPERM; | 5195 | return -EPERM; |
5179 | 5196 | ||
5180 | /* can't change other user's priorities */ | 5197 | /* can't change other user's priorities */ |
5181 | if ((current->euid != p->euid) && | 5198 | if (!check_same_owner(p)) |
5182 | (current->euid != p->uid)) | ||
5183 | return -EPERM; | 5199 | return -EPERM; |
5184 | } | 5200 | } |
5185 | 5201 | ||
@@ -5411,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) | |||
5411 | read_unlock(&tasklist_lock); | 5427 | read_unlock(&tasklist_lock); |
5412 | 5428 | ||
5413 | retval = -EPERM; | 5429 | retval = -EPERM; |
5414 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 5430 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
5415 | !capable(CAP_SYS_NICE)) | ||
5416 | goto out_unlock; | 5431 | goto out_unlock; |
5417 | 5432 | ||
5418 | retval = security_task_setscheduler(p, 0, NULL); | 5433 | retval = security_task_setscheduler(p, 0, NULL); |
@@ -5851,6 +5866,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5851 | struct rq *rq = cpu_rq(cpu); | 5866 | struct rq *rq = cpu_rq(cpu); |
5852 | unsigned long flags; | 5867 | unsigned long flags; |
5853 | 5868 | ||
5869 | spin_lock_irqsave(&rq->lock, flags); | ||
5870 | |||
5854 | __sched_fork(idle); | 5871 | __sched_fork(idle); |
5855 | idle->se.exec_start = sched_clock(); | 5872 | idle->se.exec_start = sched_clock(); |
5856 | 5873 | ||
@@ -5858,7 +5875,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5858 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 5875 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
5859 | __set_task_cpu(idle, cpu); | 5876 | __set_task_cpu(idle, cpu); |
5860 | 5877 | ||
5861 | spin_lock_irqsave(&rq->lock, flags); | ||
5862 | rq->curr = rq->idle = idle; | 5878 | rq->curr = rq->idle = idle; |
5863 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5879 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
5864 | idle->oncpu = 1; | 5880 | idle->oncpu = 1; |
@@ -5875,6 +5891,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5875 | * The idle tasks have their own, simple scheduling class: | 5891 | * The idle tasks have their own, simple scheduling class: |
5876 | */ | 5892 | */ |
5877 | idle->sched_class = &idle_sched_class; | 5893 | idle->sched_class = &idle_sched_class; |
5894 | ftrace_graph_init_task(idle); | ||
5878 | } | 5895 | } |
5879 | 5896 | ||
5880 | /* | 5897 | /* |
@@ -6105,7 +6122,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6105 | 6122 | ||
6106 | /* | 6123 | /* |
6107 | * Figure out where task on dead CPU should go, use force if necessary. | 6124 | * Figure out where task on dead CPU should go, use force if necessary. |
6108 | * NOTE: interrupts should be disabled by the caller | ||
6109 | */ | 6125 | */ |
6110 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 6126 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
6111 | { | 6127 | { |
@@ -6566,7 +6582,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6566 | req = list_entry(rq->migration_queue.next, | 6582 | req = list_entry(rq->migration_queue.next, |
6567 | struct migration_req, list); | 6583 | struct migration_req, list); |
6568 | list_del_init(&req->list); | 6584 | list_del_init(&req->list); |
6585 | spin_unlock_irq(&rq->lock); | ||
6569 | complete(&req->done); | 6586 | complete(&req->done); |
6587 | spin_lock_irq(&rq->lock); | ||
6570 | } | 6588 | } |
6571 | spin_unlock_irq(&rq->lock); | 6589 | spin_unlock_irq(&rq->lock); |
6572 | break; | 6590 | break; |
@@ -6615,28 +6633,6 @@ early_initcall(migration_init); | |||
6615 | 6633 | ||
6616 | #ifdef CONFIG_SCHED_DEBUG | 6634 | #ifdef CONFIG_SCHED_DEBUG |
6617 | 6635 | ||
6618 | static inline const char *sd_level_to_string(enum sched_domain_level lvl) | ||
6619 | { | ||
6620 | switch (lvl) { | ||
6621 | case SD_LV_NONE: | ||
6622 | return "NONE"; | ||
6623 | case SD_LV_SIBLING: | ||
6624 | return "SIBLING"; | ||
6625 | case SD_LV_MC: | ||
6626 | return "MC"; | ||
6627 | case SD_LV_CPU: | ||
6628 | return "CPU"; | ||
6629 | case SD_LV_NODE: | ||
6630 | return "NODE"; | ||
6631 | case SD_LV_ALLNODES: | ||
6632 | return "ALLNODES"; | ||
6633 | case SD_LV_MAX: | ||
6634 | return "MAX"; | ||
6635 | |||
6636 | } | ||
6637 | return "MAX"; | ||
6638 | } | ||
6639 | |||
6640 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6636 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6641 | cpumask_t *groupmask) | 6637 | cpumask_t *groupmask) |
6642 | { | 6638 | { |
@@ -6656,8 +6652,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6656 | return -1; | 6652 | return -1; |
6657 | } | 6653 | } |
6658 | 6654 | ||
6659 | printk(KERN_CONT "span %s level %s\n", | 6655 | printk(KERN_CONT "span %s level %s\n", str, sd->name); |
6660 | str, sd_level_to_string(sd->level)); | ||
6661 | 6656 | ||
6662 | if (!cpu_isset(cpu, sd->span)) { | 6657 | if (!cpu_isset(cpu, sd->span)) { |
6663 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6658 | printk(KERN_ERR "ERROR: domain->span does not contain " |
@@ -6793,6 +6788,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6793 | SD_BALANCE_EXEC | | 6788 | SD_BALANCE_EXEC | |
6794 | SD_SHARE_CPUPOWER | | 6789 | SD_SHARE_CPUPOWER | |
6795 | SD_SHARE_PKG_RESOURCES); | 6790 | SD_SHARE_PKG_RESOURCES); |
6791 | if (nr_node_ids == 1) | ||
6792 | pflags &= ~SD_SERIALIZE; | ||
6796 | } | 6793 | } |
6797 | if (~cflags & pflags) | 6794 | if (~cflags & pflags) |
6798 | return 0; | 6795 | return 0; |
@@ -6868,15 +6865,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6868 | struct sched_domain *tmp; | 6865 | struct sched_domain *tmp; |
6869 | 6866 | ||
6870 | /* Remove the sched domains which do not contribute to scheduling. */ | 6867 | /* Remove the sched domains which do not contribute to scheduling. */ |
6871 | for (tmp = sd; tmp; tmp = tmp->parent) { | 6868 | for (tmp = sd; tmp; ) { |
6872 | struct sched_domain *parent = tmp->parent; | 6869 | struct sched_domain *parent = tmp->parent; |
6873 | if (!parent) | 6870 | if (!parent) |
6874 | break; | 6871 | break; |
6872 | |||
6875 | if (sd_parent_degenerate(tmp, parent)) { | 6873 | if (sd_parent_degenerate(tmp, parent)) { |
6876 | tmp->parent = parent->parent; | 6874 | tmp->parent = parent->parent; |
6877 | if (parent->parent) | 6875 | if (parent->parent) |
6878 | parent->parent->child = tmp; | 6876 | parent->parent->child = tmp; |
6879 | } | 6877 | } else |
6878 | tmp = tmp->parent; | ||
6880 | } | 6879 | } |
6881 | 6880 | ||
6882 | if (sd && sd_degenerate(sd)) { | 6881 | if (sd && sd_degenerate(sd)) { |
@@ -7311,13 +7310,21 @@ struct allmasks { | |||
7311 | }; | 7310 | }; |
7312 | 7311 | ||
7313 | #if NR_CPUS > 128 | 7312 | #if NR_CPUS > 128 |
7314 | #define SCHED_CPUMASK_ALLOC 1 | 7313 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v |
7315 | #define SCHED_CPUMASK_FREE(v) kfree(v) | 7314 | static inline void sched_cpumask_alloc(struct allmasks **masks) |
7316 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v | 7315 | { |
7316 | *masks = kmalloc(sizeof(**masks), GFP_KERNEL); | ||
7317 | } | ||
7318 | static inline void sched_cpumask_free(struct allmasks *masks) | ||
7319 | { | ||
7320 | kfree(masks); | ||
7321 | } | ||
7317 | #else | 7322 | #else |
7318 | #define SCHED_CPUMASK_ALLOC 0 | 7323 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v |
7319 | #define SCHED_CPUMASK_FREE(v) | 7324 | static inline void sched_cpumask_alloc(struct allmasks **masks) |
7320 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v | 7325 | { } |
7326 | static inline void sched_cpumask_free(struct allmasks *masks) | ||
7327 | { } | ||
7321 | #endif | 7328 | #endif |
7322 | 7329 | ||
7323 | #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ | 7330 | #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ |
@@ -7393,9 +7400,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7393 | return -ENOMEM; | 7400 | return -ENOMEM; |
7394 | } | 7401 | } |
7395 | 7402 | ||
7396 | #if SCHED_CPUMASK_ALLOC | ||
7397 | /* get space for all scratch cpumask variables */ | 7403 | /* get space for all scratch cpumask variables */ |
7398 | allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); | 7404 | sched_cpumask_alloc(&allmasks); |
7399 | if (!allmasks) { | 7405 | if (!allmasks) { |
7400 | printk(KERN_WARNING "Cannot alloc cpumask array\n"); | 7406 | printk(KERN_WARNING "Cannot alloc cpumask array\n"); |
7401 | kfree(rd); | 7407 | kfree(rd); |
@@ -7404,7 +7410,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7404 | #endif | 7410 | #endif |
7405 | return -ENOMEM; | 7411 | return -ENOMEM; |
7406 | } | 7412 | } |
7407 | #endif | 7413 | |
7408 | tmpmask = (cpumask_t *)allmasks; | 7414 | tmpmask = (cpumask_t *)allmasks; |
7409 | 7415 | ||
7410 | 7416 | ||
@@ -7658,13 +7664,14 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7658 | cpu_attach_domain(sd, rd, i); | 7664 | cpu_attach_domain(sd, rd, i); |
7659 | } | 7665 | } |
7660 | 7666 | ||
7661 | SCHED_CPUMASK_FREE((void *)allmasks); | 7667 | sched_cpumask_free(allmasks); |
7662 | return 0; | 7668 | return 0; |
7663 | 7669 | ||
7664 | #ifdef CONFIG_NUMA | 7670 | #ifdef CONFIG_NUMA |
7665 | error: | 7671 | error: |
7666 | free_sched_groups(cpu_map, tmpmask); | 7672 | free_sched_groups(cpu_map, tmpmask); |
7667 | SCHED_CPUMASK_FREE((void *)allmasks); | 7673 | sched_cpumask_free(allmasks); |
7674 | kfree(rd); | ||
7668 | return -ENOMEM; | 7675 | return -ENOMEM; |
7669 | #endif | 7676 | #endif |
7670 | } | 7677 | } |
@@ -7686,8 +7693,14 @@ static struct sched_domain_attr *dattr_cur; | |||
7686 | */ | 7693 | */ |
7687 | static cpumask_t fallback_doms; | 7694 | static cpumask_t fallback_doms; |
7688 | 7695 | ||
7689 | void __attribute__((weak)) arch_update_cpu_topology(void) | 7696 | /* |
7697 | * arch_update_cpu_topology lets virtualized architectures update the | ||
7698 | * cpu core maps. It is supposed to return 1 if the topology changed | ||
7699 | * or 0 if it stayed the same. | ||
7700 | */ | ||
7701 | int __attribute__((weak)) arch_update_cpu_topology(void) | ||
7690 | { | 7702 | { |
7703 | return 0; | ||
7691 | } | 7704 | } |
7692 | 7705 | ||
7693 | /* | 7706 | /* |
@@ -7727,8 +7740,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
7727 | cpumask_t tmpmask; | 7740 | cpumask_t tmpmask; |
7728 | int i; | 7741 | int i; |
7729 | 7742 | ||
7730 | unregister_sched_domain_sysctl(); | ||
7731 | |||
7732 | for_each_cpu_mask_nr(i, *cpu_map) | 7743 | for_each_cpu_mask_nr(i, *cpu_map) |
7733 | cpu_attach_domain(NULL, &def_root_domain, i); | 7744 | cpu_attach_domain(NULL, &def_root_domain, i); |
7734 | synchronize_sched(); | 7745 | synchronize_sched(); |
@@ -7766,13 +7777,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
7766 | * | 7777 | * |
7767 | * The passed in 'doms_new' should be kmalloc'd. This routine takes | 7778 | * The passed in 'doms_new' should be kmalloc'd. This routine takes |
7768 | * ownership of it and will kfree it when done with it. If the caller | 7779 | * ownership of it and will kfree it when done with it. If the caller |
7769 | * failed the kmalloc call, then it can pass in doms_new == NULL, | 7780 | * failed the kmalloc call, then it can pass in doms_new == NULL && |
7770 | * and partition_sched_domains() will fallback to the single partition | 7781 | * ndoms_new == 1, and partition_sched_domains() will fallback to |
7771 | * 'fallback_doms', it also forces the domains to be rebuilt. | 7782 | * the single partition 'fallback_doms', it also forces the domains |
7783 | * to be rebuilt. | ||
7772 | * | 7784 | * |
7773 | * If doms_new==NULL it will be replaced with cpu_online_map. | 7785 | * If doms_new == NULL it will be replaced with cpu_online_map. |
7774 | * ndoms_new==0 is a special case for destroying existing domains. | 7786 | * ndoms_new == 0 is a special case for destroying existing domains, |
7775 | * It will not create the default domain. | 7787 | * and it will not create the default domain. |
7776 | * | 7788 | * |
7777 | * Call with hotplug lock held | 7789 | * Call with hotplug lock held |
7778 | */ | 7790 | */ |
@@ -7780,17 +7792,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | |||
7780 | struct sched_domain_attr *dattr_new) | 7792 | struct sched_domain_attr *dattr_new) |
7781 | { | 7793 | { |
7782 | int i, j, n; | 7794 | int i, j, n; |
7795 | int new_topology; | ||
7783 | 7796 | ||
7784 | mutex_lock(&sched_domains_mutex); | 7797 | mutex_lock(&sched_domains_mutex); |
7785 | 7798 | ||
7786 | /* always unregister in case we don't destroy any domains */ | 7799 | /* always unregister in case we don't destroy any domains */ |
7787 | unregister_sched_domain_sysctl(); | 7800 | unregister_sched_domain_sysctl(); |
7788 | 7801 | ||
7802 | /* Let architecture update cpu core mappings. */ | ||
7803 | new_topology = arch_update_cpu_topology(); | ||
7804 | |||
7789 | n = doms_new ? ndoms_new : 0; | 7805 | n = doms_new ? ndoms_new : 0; |
7790 | 7806 | ||
7791 | /* Destroy deleted domains */ | 7807 | /* Destroy deleted domains */ |
7792 | for (i = 0; i < ndoms_cur; i++) { | 7808 | for (i = 0; i < ndoms_cur; i++) { |
7793 | for (j = 0; j < n; j++) { | 7809 | for (j = 0; j < n && !new_topology; j++) { |
7794 | if (cpus_equal(doms_cur[i], doms_new[j]) | 7810 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7795 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7811 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
7796 | goto match1; | 7812 | goto match1; |
@@ -7805,12 +7821,12 @@ match1: | |||
7805 | ndoms_cur = 0; | 7821 | ndoms_cur = 0; |
7806 | doms_new = &fallback_doms; | 7822 | doms_new = &fallback_doms; |
7807 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7823 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
7808 | dattr_new = NULL; | 7824 | WARN_ON_ONCE(dattr_new); |
7809 | } | 7825 | } |
7810 | 7826 | ||
7811 | /* Build new domains */ | 7827 | /* Build new domains */ |
7812 | for (i = 0; i < ndoms_new; i++) { | 7828 | for (i = 0; i < ndoms_new; i++) { |
7813 | for (j = 0; j < ndoms_cur; j++) { | 7829 | for (j = 0; j < ndoms_cur && !new_topology; j++) { |
7814 | if (cpus_equal(doms_new[i], doms_cur[j]) | 7830 | if (cpus_equal(doms_new[i], doms_cur[j]) |
7815 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 7831 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
7816 | goto match2; | 7832 | goto match2; |
@@ -8465,7 +8481,7 @@ static | |||
8465 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 8481 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) |
8466 | { | 8482 | { |
8467 | struct cfs_rq *cfs_rq; | 8483 | struct cfs_rq *cfs_rq; |
8468 | struct sched_entity *se, *parent_se; | 8484 | struct sched_entity *se; |
8469 | struct rq *rq; | 8485 | struct rq *rq; |
8470 | int i; | 8486 | int i; |
8471 | 8487 | ||
@@ -8481,18 +8497,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8481 | for_each_possible_cpu(i) { | 8497 | for_each_possible_cpu(i) { |
8482 | rq = cpu_rq(i); | 8498 | rq = cpu_rq(i); |
8483 | 8499 | ||
8484 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), | 8500 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8485 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 8501 | GFP_KERNEL, cpu_to_node(i)); |
8486 | if (!cfs_rq) | 8502 | if (!cfs_rq) |
8487 | goto err; | 8503 | goto err; |
8488 | 8504 | ||
8489 | se = kmalloc_node(sizeof(struct sched_entity), | 8505 | se = kzalloc_node(sizeof(struct sched_entity), |
8490 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 8506 | GFP_KERNEL, cpu_to_node(i)); |
8491 | if (!se) | 8507 | if (!se) |
8492 | goto err; | 8508 | goto err; |
8493 | 8509 | ||
8494 | parent_se = parent ? parent->se[i] : NULL; | 8510 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); |
8495 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); | ||
8496 | } | 8511 | } |
8497 | 8512 | ||
8498 | return 1; | 8513 | return 1; |
@@ -8553,7 +8568,7 @@ static | |||
8553 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | 8568 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) |
8554 | { | 8569 | { |
8555 | struct rt_rq *rt_rq; | 8570 | struct rt_rq *rt_rq; |
8556 | struct sched_rt_entity *rt_se, *parent_se; | 8571 | struct sched_rt_entity *rt_se; |
8557 | struct rq *rq; | 8572 | struct rq *rq; |
8558 | int i; | 8573 | int i; |
8559 | 8574 | ||
@@ -8570,18 +8585,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8570 | for_each_possible_cpu(i) { | 8585 | for_each_possible_cpu(i) { |
8571 | rq = cpu_rq(i); | 8586 | rq = cpu_rq(i); |
8572 | 8587 | ||
8573 | rt_rq = kmalloc_node(sizeof(struct rt_rq), | 8588 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8574 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 8589 | GFP_KERNEL, cpu_to_node(i)); |
8575 | if (!rt_rq) | 8590 | if (!rt_rq) |
8576 | goto err; | 8591 | goto err; |
8577 | 8592 | ||
8578 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), | 8593 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), |
8579 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 8594 | GFP_KERNEL, cpu_to_node(i)); |
8580 | if (!rt_se) | 8595 | if (!rt_se) |
8581 | goto err; | 8596 | goto err; |
8582 | 8597 | ||
8583 | parent_se = parent ? parent->rt_se[i] : NULL; | 8598 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); |
8584 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); | ||
8585 | } | 8599 | } |
8586 | 8600 | ||
8587 | return 1; | 8601 | return 1; |
@@ -9224,11 +9238,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9224 | * (balbir@in.ibm.com). | 9238 | * (balbir@in.ibm.com). |
9225 | */ | 9239 | */ |
9226 | 9240 | ||
9227 | /* track cpu usage of a group of tasks */ | 9241 | /* track cpu usage of a group of tasks and its child groups */ |
9228 | struct cpuacct { | 9242 | struct cpuacct { |
9229 | struct cgroup_subsys_state css; | 9243 | struct cgroup_subsys_state css; |
9230 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 9244 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
9231 | u64 *cpuusage; | 9245 | u64 *cpuusage; |
9246 | struct cpuacct *parent; | ||
9232 | }; | 9247 | }; |
9233 | 9248 | ||
9234 | struct cgroup_subsys cpuacct_subsys; | 9249 | struct cgroup_subsys cpuacct_subsys; |
@@ -9262,6 +9277,9 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
9262 | return ERR_PTR(-ENOMEM); | 9277 | return ERR_PTR(-ENOMEM); |
9263 | } | 9278 | } |
9264 | 9279 | ||
9280 | if (cgrp->parent) | ||
9281 | ca->parent = cgroup_ca(cgrp->parent); | ||
9282 | |||
9265 | return &ca->css; | 9283 | return &ca->css; |
9266 | } | 9284 | } |
9267 | 9285 | ||
@@ -9275,6 +9293,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9275 | kfree(ca); | 9293 | kfree(ca); |
9276 | } | 9294 | } |
9277 | 9295 | ||
9296 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
9297 | { | ||
9298 | u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); | ||
9299 | u64 data; | ||
9300 | |||
9301 | #ifndef CONFIG_64BIT | ||
9302 | /* | ||
9303 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
9304 | */ | ||
9305 | spin_lock_irq(&cpu_rq(cpu)->lock); | ||
9306 | data = *cpuusage; | ||
9307 | spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
9308 | #else | ||
9309 | data = *cpuusage; | ||
9310 | #endif | ||
9311 | |||
9312 | return data; | ||
9313 | } | ||
9314 | |||
9315 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
9316 | { | ||
9317 | u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); | ||
9318 | |||
9319 | #ifndef CONFIG_64BIT | ||
9320 | /* | ||
9321 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
9322 | */ | ||
9323 | spin_lock_irq(&cpu_rq(cpu)->lock); | ||
9324 | *cpuusage = val; | ||
9325 | spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
9326 | #else | ||
9327 | *cpuusage = val; | ||
9328 | #endif | ||
9329 | } | ||
9330 | |||
9278 | /* return total cpu usage (in nanoseconds) of a group */ | 9331 | /* return total cpu usage (in nanoseconds) of a group */ |
9279 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | 9332 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) |
9280 | { | 9333 | { |
@@ -9282,17 +9335,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | |||
9282 | u64 totalcpuusage = 0; | 9335 | u64 totalcpuusage = 0; |
9283 | int i; | 9336 | int i; |
9284 | 9337 | ||
9285 | for_each_possible_cpu(i) { | 9338 | for_each_present_cpu(i) |
9286 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | 9339 | totalcpuusage += cpuacct_cpuusage_read(ca, i); |
9287 | |||
9288 | /* | ||
9289 | * Take rq->lock to make 64-bit addition safe on 32-bit | ||
9290 | * platforms. | ||
9291 | */ | ||
9292 | spin_lock_irq(&cpu_rq(i)->lock); | ||
9293 | totalcpuusage += *cpuusage; | ||
9294 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
9295 | } | ||
9296 | 9340 | ||
9297 | return totalcpuusage; | 9341 | return totalcpuusage; |
9298 | } | 9342 | } |
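Note: the cpuacct helpers above exist because a 64-bit counter cannot be loaded or stored atomically on 32-bit hardware, so the access is done under the per-cpu rq->lock there, while 64-bit builds keep the plain access. A small userspace analogue of the same split (assumed names, a pthread mutex standing in for rq->lock):

```c
#include <stdint.h>
#include <pthread.h>

struct usage_counter {
	pthread_mutex_t lock;
	uint64_t value;
};

static uint64_t usage_read(struct usage_counter *c)
{
#if UINTPTR_MAX == 0xffffffffu		/* 32-bit: a 64-bit load can tear */
	uint64_t v;

	pthread_mutex_lock(&c->lock);
	v = c->value;
	pthread_mutex_unlock(&c->lock);
	return v;
#else					/* 64-bit: an aligned load is not torn */
	return c->value;
#endif
}
```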
@@ -9309,23 +9353,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | |||
9309 | goto out; | 9353 | goto out; |
9310 | } | 9354 | } |
9311 | 9355 | ||
9312 | for_each_possible_cpu(i) { | 9356 | for_each_present_cpu(i) |
9313 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | 9357 | cpuacct_cpuusage_write(ca, i, 0); |
9314 | 9358 | ||
9315 | spin_lock_irq(&cpu_rq(i)->lock); | ||
9316 | *cpuusage = 0; | ||
9317 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
9318 | } | ||
9319 | out: | 9359 | out: |
9320 | return err; | 9360 | return err; |
9321 | } | 9361 | } |
9322 | 9362 | ||
9363 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
9364 | struct seq_file *m) | ||
9365 | { | ||
9366 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
9367 | u64 percpu; | ||
9368 | int i; | ||
9369 | |||
9370 | for_each_present_cpu(i) { | ||
9371 | percpu = cpuacct_cpuusage_read(ca, i); | ||
9372 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
9373 | } | ||
9374 | seq_printf(m, "\n"); | ||
9375 | return 0; | ||
9376 | } | ||
9377 | |||
9323 | static struct cftype files[] = { | 9378 | static struct cftype files[] = { |
9324 | { | 9379 | { |
9325 | .name = "usage", | 9380 | .name = "usage", |
9326 | .read_u64 = cpuusage_read, | 9381 | .read_u64 = cpuusage_read, |
9327 | .write_u64 = cpuusage_write, | 9382 | .write_u64 = cpuusage_write, |
9328 | }, | 9383 | }, |
9384 | { | ||
9385 | .name = "usage_percpu", | ||
9386 | .read_seq_string = cpuacct_percpu_seq_read, | ||
9387 | }, | ||
9388 | |||
9329 | }; | 9389 | }; |
9330 | 9390 | ||
9331 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9391 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
@@ -9341,14 +9401,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9341 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 9401 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
9342 | { | 9402 | { |
9343 | struct cpuacct *ca; | 9403 | struct cpuacct *ca; |
9404 | int cpu; | ||
9344 | 9405 | ||
9345 | if (!cpuacct_subsys.active) | 9406 | if (!cpuacct_subsys.active) |
9346 | return; | 9407 | return; |
9347 | 9408 | ||
9409 | cpu = task_cpu(tsk); | ||
9348 | ca = task_ca(tsk); | 9410 | ca = task_ca(tsk); |
9349 | if (ca) { | ||
9350 | u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); | ||
9351 | 9411 | ||
9412 | for (; ca; ca = ca->parent) { | ||
9413 | u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); | ||
9352 | *cpuusage += cputime; | 9414 | *cpuusage += cputime; |
9353 | } | 9415 | } |
9354 | } | 9416 | } |
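Note: with the new parent pointer, cpuacct_charge() above walks from the task's group up to the root so every ancestor cgroup accumulates the time as well. A self-contained sketch of that upward walk (illustrative types, not the kernel structures):

```c
#include <stdint.h>

struct acct_group {
	struct acct_group *parent;	/* NULL at the root */
	uint64_t usage;
};

/* Charge 'delta' to the group and every ancestor, mirroring the
 * for (; ca; ca = ca->parent) loop added to cpuacct_charge(). */
static void charge_hierarchy(struct acct_group *g, uint64_t delta)
{
	for (; g; g = g->parent)
		g->usage += delta;
}
```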