Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    544
1 files changed, 386 insertions, 158 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 04160d277e7a..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
205} 210}
206 211
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 213{
209 ktime_t now; 214 ktime_t now;
210 215
211 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 217 return;
213 218
214 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
300#endif /* CONFIG_RT_GROUP_SCHED */ 305#endif /* CONFIG_RT_GROUP_SCHED */
301#else /* !CONFIG_FAIR_GROUP_SCHED */ 306#else /* !CONFIG_USER_SCHED */
302#define root_task_group init_task_group 307#define root_task_group init_task_group
303#endif /* CONFIG_FAIR_GROUP_SCHED */ 308#endif /* CONFIG_USER_SCHED */
304 309
305/* task_group_lock serializes add/remove of task groups and also changes to 310/* task_group_lock serializes add/remove of task groups and also changes to
306 * a task group's cpu shares. 311 * a task group's cpu shares.
@@ -600,14 +605,13 @@ struct rq {
600 /* BKL stats */ 605 /* BKL stats */
601 unsigned int bkl_count; 606 unsigned int bkl_count;
602#endif 607#endif
603 struct lock_class_key rq_lock_key;
604}; 608};
605 609
606static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
607 611
608static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
609{ 613{
610 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
611} 615}
612 616
613static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -809,9 +813,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
809 813
810/* 814/*
811 * ratelimit for updating the group shares. 815 * ratelimit for updating the group shares.
812 * default: 0.5ms 816 * default: 0.25ms
813 */ 817 */
814const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; 818unsigned int sysctl_sched_shares_ratelimit = 250000;
815 819
816/* 820/*
817 * period over which we measure -rt task cpu usage in us. 821 * period over which we measure -rt task cpu usage in us.
@@ -834,7 +838,7 @@ static inline u64 global_rt_period(void)
834 838
835static inline u64 global_rt_runtime(void) 839static inline u64 global_rt_runtime(void)
836{ 840{
837 if (sysctl_sched_rt_period < 0) 841 if (sysctl_sched_rt_runtime < 0)
838 return RUNTIME_INF; 842 return RUNTIME_INF;
839 843
840 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 844 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -1088,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1088 return NOTIFY_DONE; 1092 return NOTIFY_DONE;
1089} 1093}
1090 1094
1091static void init_hrtick(void) 1095static __init void init_hrtick(void)
1092{ 1096{
1093 hotcpu_notifier(hotplug_hrtick, 0); 1097 hotcpu_notifier(hotplug_hrtick, 0);
1094} 1098}
@@ -1103,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1103 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1104} 1108}
1105 1109
1106static void init_hrtick(void) 1110static inline void init_hrtick(void)
1107{ 1111{
1108} 1112}
1109#endif /* CONFIG_SMP */ 1113#endif /* CONFIG_SMP */
@@ -1120,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
1120 1124
1121 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1122 rq->hrtick_timer.function = hrtick; 1126 rq->hrtick_timer.function = hrtick;
1123 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1124} 1128}
1125#else 1129#else /* CONFIG_SCHED_HRTICK */
1126static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1127{ 1131{
1128} 1132}
@@ -1134,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1134static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1135{ 1139{
1136} 1140}
1137#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1138 1142
1139/* 1143/*
1140 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1381,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1381 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1382} 1386}
1383 1387
1384#ifdef CONFIG_SMP 1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1385static unsigned long source_load(int cpu, int type); 1389typedef int (*tg_visitor)(struct task_group *, void *);
1386static unsigned long target_load(int cpu, int type);
1387static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1388
1389static unsigned long cpu_avg_load_per_task(int cpu)
1390{
1391 struct rq *rq = cpu_rq(cpu);
1392
1393 if (rq->nr_running)
1394 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1395
1396 return rq->avg_load_per_task;
1397}
1398
1399#ifdef CONFIG_FAIR_GROUP_SCHED
1400
1401typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1402 1390
1403/* 1391/*
1404 * Iterate the full tree, calling @down when first entering a node and @up when 1392 * Iterate the full tree, calling @down when first entering a node and @up when
1405 * leaving it for the final time. 1393 * leaving it for the final time.
1406 */ 1394 */
1407static void 1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1408walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1409{ 1396{
1410 struct task_group *parent, *child; 1397 struct task_group *parent, *child;
1398 int ret;
1411 1399
1412 rcu_read_lock(); 1400 rcu_read_lock();
1413 parent = &root_task_group; 1401 parent = &root_task_group;
1414down: 1402down:
1415 (*down)(parent, cpu, sd); 1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1416 list_for_each_entry_rcu(child, &parent->children, siblings) { 1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1417 parent = child; 1407 parent = child;
1418 goto down; 1408 goto down;
@@ -1420,14 +1410,42 @@ down:
1420up: 1410up:
1421 continue; 1411 continue;
1422 } 1412 }
1423 (*up)(parent, cpu, sd); 1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1424 1416
1425 child = parent; 1417 child = parent;
1426 parent = parent->parent; 1418 parent = parent->parent;
1427 if (parent) 1419 if (parent)
1428 goto up; 1420 goto up;
1421out_unlock:
1429 rcu_read_unlock(); 1422 rcu_read_unlock();
1423
1424 return ret;
1425}
1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1430} 1430}
1431#endif
1432
1433#ifdef CONFIG_SMP
1434static unsigned long source_load(int cpu, int type);
1435static unsigned long target_load(int cpu, int type);
1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1447
1448#ifdef CONFIG_FAIR_GROUP_SCHED
1431 1449
1432static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1433 1451
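
The rework above turns walk_tg_tree() into a generic visitor: both callbacks now take an opaque void *data instead of (cpu, sd), return an int, and any non-zero return aborts the walk and is propagated to the caller. A minimal sketch of a caller, assuming only the signatures introduced by these hunks (the counting visitor itself is hypothetical, not part of the patch):

    /* Hypothetical visitor: count the task groups reachable from
     * root_task_group, using the new signature
     * int (*tg_visitor)(struct task_group *, void *).
     * Returning non-zero would abort the walk early.
     */
    static int tg_count(struct task_group *tg, void *data)
    {
            unsigned long *count = data;

            (*count)++;
            return 0;
    }

    static unsigned long nr_task_groups(void)
    {
            unsigned long count = 0;

            /* tg_nop is the no-op "up" callback added in this hunk */
            walk_tg_tree(tg_count, tg_nop, &count);
            return count;
    }
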
@@ -1487,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1487 * This needs to be done in a bottom-up fashion because the rq weight of a 1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1488 * parent group depends on the shares of its child groups. 1506 * parent group depends on the shares of its child groups.
1489 */ 1507 */
1490static void 1508static int tg_shares_up(struct task_group *tg, void *data)
1491tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1492{ 1509{
1493 unsigned long rq_weight = 0; 1510 unsigned long rq_weight = 0;
1494 unsigned long shares = 0; 1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1495 int i; 1513 int i;
1496 1514
1497 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu_mask(i, sd->span) {
@@ -1516,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1516 __update_group_shares_cpu(tg, i, shares, rq_weight); 1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1517 spin_unlock_irqrestore(&rq->lock, flags); 1535 spin_unlock_irqrestore(&rq->lock, flags);
1518 } 1536 }
1537
1538 return 0;
1519} 1539}
1520 1540
1521/* 1541/*
@@ -1523,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1523 * This needs to be done in a top-down fashion because the load of a child 1543 * This needs to be done in a top-down fashion because the load of a child
1524 * group is a fraction of its parents load. 1544 * group is a fraction of its parents load.
1525 */ 1545 */
1526static void 1546static int tg_load_down(struct task_group *tg, void *data)
1527tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1528{ 1547{
1529 unsigned long load; 1548 unsigned long load;
1549 long cpu = (long)data;
1530 1550
1531 if (!tg->parent) { 1551 if (!tg->parent) {
1532 load = cpu_rq(cpu)->load.weight; 1552 load = cpu_rq(cpu)->load.weight;
@@ -1537,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1537 } 1557 }
1538 1558
1539 tg->cfs_rq[cpu]->h_load = load; 1559 tg->cfs_rq[cpu]->h_load = load;
1540}
1541 1560
1542static void 1561 return 0;
1543tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1544{
1545} 1562}
1546 1563
1547static void update_shares(struct sched_domain *sd) 1564static void update_shares(struct sched_domain *sd)
@@ -1551,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
1551 1568
1552 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1553 sd->last_update = now; 1570 sd->last_update = now;
1554 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1555 } 1572 }
1556} 1573}
1557 1574
@@ -1562,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1562 spin_lock(&rq->lock); 1579 spin_lock(&rq->lock);
1563} 1580}
1564 1581
1565static void update_h_load(int cpu) 1582static void update_h_load(long cpu)
1566{ 1583{
1567 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1568} 1585}
1569 1586
1570#else 1587#else
@@ -1922,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1922 running = task_running(rq, p); 1939 running = task_running(rq, p);
1923 on_rq = p->se.on_rq; 1940 on_rq = p->se.on_rq;
1924 ncsw = 0; 1941 ncsw = 0;
1925 if (!match_state || p->state == match_state) { 1942 if (!match_state || p->state == match_state)
1926 ncsw = p->nivcsw + p->nvcsw; 1943 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1927 if (unlikely(!ncsw))
1928 ncsw = 1;
1929 }
1930 task_rq_unlock(rq, &flags); 1944 task_rq_unlock(rq, &flags);
1931 1945
1932 /* 1946 /*
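
The simplification above changes how wait_task_inactive() distinguishes "state did not match" from a real switch count: instead of special-casing a zero count, it ORs LONG_MIN into p->nvcsw, so a matched sample always has the sign bit set and can never collide with the 0 that means "no match". A hedged sketch of the encoding (sample_ncsw is an illustrative name, not from the patch):

    /* Illustration only: encode a sample of the voluntary context switch
     * count together with a "state matched" flag in one long, as the hunk
     * above does.
     */
    static long sample_ncsw(struct task_struct *p, long match_state)
    {
            if (!match_state || p->state == match_state)
                    return p->nvcsw | LONG_MIN;     /* MSB set: valid sample */
            return 0;                               /* 0: state did not match */
    }

    /* Two valid samples both carry the MSB, so they can be compared for
     * equality directly: a changed nvcsw (or an invalid sample) compares
     * unequal, while a genuine count of zero is still distinguishable
     * from "never matched".
     */
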
@@ -2286,7 +2300,7 @@ out_running:
2286 trace_mark(kernel_sched_wakeup, 2300 trace_mark(kernel_sched_wakeup,
2287 "pid %d state %ld ## rq %p task %p rq->curr %p", 2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2288 p->pid, p->state, rq, p, rq->curr); 2302 p->pid, p->state, rq, p, rq->curr);
2289 check_preempt_curr(rq, p); 2303 check_preempt_curr(rq, p, sync);
2290 2304
2291 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2292#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2421,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2421 trace_mark(kernel_sched_wakeup_new, 2435 trace_mark(kernel_sched_wakeup_new,
2422 "pid %d state %ld ## rq %p task %p rq->curr %p", 2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2423 p->pid, p->state, rq, p, rq->curr); 2437 p->pid, p->state, rq, p, rq->curr);
2424 check_preempt_curr(rq, p); 2438 check_preempt_curr(rq, p, 0);
2425#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2426 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2427 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2759,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2759 } else { 2773 } else {
2760 if (rq1 < rq2) { 2774 if (rq1 < rq2) {
2761 spin_lock(&rq1->lock); 2775 spin_lock(&rq1->lock);
2762 spin_lock(&rq2->lock); 2776 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2763 } else { 2777 } else {
2764 spin_lock(&rq2->lock); 2778 spin_lock(&rq2->lock);
2765 spin_lock(&rq1->lock); 2779 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2766 } 2780 }
2767 } 2781 }
2768 update_rq_clock(rq1); 2782 update_rq_clock(rq1);
@@ -2805,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2805 if (busiest < this_rq) { 2819 if (busiest < this_rq) {
2806 spin_unlock(&this_rq->lock); 2820 spin_unlock(&this_rq->lock);
2807 spin_lock(&busiest->lock); 2821 spin_lock(&busiest->lock);
2808 spin_lock(&this_rq->lock); 2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2809 ret = 1; 2823 ret = 1;
2810 } else 2824 } else
2811 spin_lock(&busiest->lock); 2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2812 } 2826 }
2813 return ret; 2827 return ret;
2814} 2828}
2815 2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
2816/* 2837/*
2817 * If dest_cpu is allowed for this process, migrate the task to it. 2838 * If dest_cpu is allowed for this process, migrate the task to it.
2818 * This is accomplished by forcing the cpu_allowed mask to only 2839 * This is accomplished by forcing the cpu_allowed mask to only
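
The double_rq_lock()/double_lock_balance() changes above keep the old deadlock-avoidance rule (always take the lower-addressed runqueue lock first) but now describe it to lockdep: the second lock is taken with spin_lock_nested(..., SINGLE_DEPTH_NESTING), and the new double_unlock_balance() resets the subclass via lock_set_subclass() once the busiest lock is dropped. A hedged sketch of the ordering rule on its own, with an illustrative helper name (the real functions also carry __acquires/__releases annotations):

    /* Illustrative: lock two runqueues in address order so that two CPUs
     * doing this concurrently cannot deadlock; annotate the inner lock
     * for lockdep, as the patch does.
     */
    static void lock_two_rqs(struct rq *rq1, struct rq *rq2)
    {
            if (rq1 == rq2) {
                    spin_lock(&rq1->lock);
                    return;
            }
            if (rq1 < rq2) {
                    spin_lock(&rq1->lock);
                    spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
            } else {
                    spin_lock(&rq2->lock);
                    spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
            }
    }
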
@@ -2874,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2874 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2875 * to be always true for them. 2896 * to be always true for them.
2876 */ 2897 */
2877 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2878} 2899}
2879 2900
2880/* 2901/*
@@ -3637,7 +3658,7 @@ redo:
3637 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3658 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3638 imbalance, sd, CPU_NEWLY_IDLE, 3659 imbalance, sd, CPU_NEWLY_IDLE,
3639 &all_pinned); 3660 &all_pinned);
3640 spin_unlock(&busiest->lock); 3661 double_unlock_balance(this_rq, busiest);
3641 3662
3642 if (unlikely(all_pinned)) { 3663 if (unlikely(all_pinned)) {
3643 cpu_clear(cpu_of(busiest), *cpus); 3664 cpu_clear(cpu_of(busiest), *cpus);
@@ -3752,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3752 else 3773 else
3753 schedstat_inc(sd, alb_failed); 3774 schedstat_inc(sd, alb_failed);
3754 } 3775 }
3755 spin_unlock(&target_rq->lock); 3776 double_unlock_balance(busiest_rq, target_rq);
3756} 3777}
3757 3778
3758#ifdef CONFIG_NO_HZ 3779#ifdef CONFIG_NO_HZ
@@ -4173,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4173} 4194}
4174 4195
4175/* 4196/*
4197 * Use precise platform statistics if available:
4198 */
4199#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4200cputime_t task_utime(struct task_struct *p)
4201{
4202 return p->utime;
4203}
4204
4205cputime_t task_stime(struct task_struct *p)
4206{
4207 return p->stime;
4208}
4209#else
4210cputime_t task_utime(struct task_struct *p)
4211{
4212 clock_t utime = cputime_to_clock_t(p->utime),
4213 total = utime + cputime_to_clock_t(p->stime);
4214 u64 temp;
4215
4216 /*
4217 * Use CFS's precise accounting:
4218 */
4219 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4220
4221 if (total) {
4222 temp *= utime;
4223 do_div(temp, total);
4224 }
4225 utime = (clock_t)temp;
4226
4227 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4228 return p->prev_utime;
4229}
4230
4231cputime_t task_stime(struct task_struct *p)
4232{
4233 clock_t stime;
4234
4235 /*
4236 * Use CFS's precise accounting. (we subtract utime from
4237 * the total, to make sure the total observed by userspace
4238 * grows monotonically - apps rely on that):
4239 */
4240 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4241 cputime_to_clock_t(task_utime(p));
4242
4243 if (stime >= 0)
4244 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4245
4246 return p->prev_stime;
4247}
4248#endif
4249
4250inline cputime_t task_gtime(struct task_struct *p)
4251{
4252 return p->gtime;
4253}
4254
4255/*
4176 * This function gets called by the timer code, with HZ frequency. 4256 * This function gets called by the timer code, with HZ frequency.
4177 * We call it with interrupts disabled. 4257 * We call it with interrupts disabled.
4178 * 4258 *
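
The new task_utime()/task_stime() helpers scale the tick-sampled utime/stime split by CFS's precise sum_exec_runtime and clamp both against prev_utime/prev_stime so the values reported to userspace never go backwards. A worked example with made-up numbers, purely illustrative:

    /* In clock_t units:
     *   tick-sampled  utime = 300, stime = 100   (total = 400)
     *   CFS-precise   sum_exec_runtime, converted: 420
     *
     * Scaled user time keeps the 3:1 ratio but uses the precise total:
     *   utime' = 420 * 300 / 400 = 315
     * System time is the remainder, so the observed sum stays exactly 420:
     *   stime' = 420 - 315       = 105
     *
     * Both results are then fed through max() against prev_utime/prev_stime,
     * which is what keeps the totals monotonic across samples.
     */
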
@@ -4562,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4562} 4642}
4563EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4564 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4565void complete(struct completion *x) 4654void complete(struct completion *x)
4566{ 4655{
4567 unsigned long flags; 4656 unsigned long flags;
@@ -4573,6 +4662,12 @@ void complete(struct completion *x)
4573} 4662}
4574EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4575 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4576void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4577{ 4672{
4578 unsigned long flags; 4673 unsigned long flags;
@@ -4593,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4593 wait.flags |= WQ_FLAG_EXCLUSIVE; 4688 wait.flags |= WQ_FLAG_EXCLUSIVE;
4594 __add_wait_queue_tail(&x->wait, &wait); 4689 __add_wait_queue_tail(&x->wait, &wait);
4595 do { 4690 do {
4596 if ((state == TASK_INTERRUPTIBLE && 4691 if (signal_pending_state(state, current)) {
4597 signal_pending(current)) ||
4598 (state == TASK_KILLABLE &&
4599 fatal_signal_pending(current))) {
4600 timeout = -ERESTARTSYS; 4692 timeout = -ERESTARTSYS;
4601 break; 4693 break;
4602 } 4694 }
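
The open-coded state checks removed above are folded into signal_pending_state(), which decides whether a pending signal should break a wait in the given sleep state. Its behaviour here amounts to roughly the following sketch (an illustrative equivalent, not the header's exact definition; note TASK_KILLABLE includes TASK_WAKEKILL):

    /* Interruptible sleeps react to any pending signal, killable sleeps
     * only to fatal ones, uninterruptible sleeps to none.
     */
    static inline int signal_breaks_wait(long state, struct task_struct *p)
    {
            if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                    return 0;
            if (!signal_pending(p))
                    return 0;

            return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
    }
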
@@ -4624,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4624 return timeout; 4716 return timeout;
4625} 4717}
4626 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
4726 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4627void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4628{ 4730{
4629 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4630} 4732}
4631EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
4632 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
4740 * This waits for either a completion of a specific task to be signaled or for a
4741 * specified timeout to expire. The timeout is in jiffies. It is not
4742 * interruptible.
4743 */
4633unsigned long __sched 4744unsigned long __sched
4634wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4635{ 4746{
@@ -4637,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4637} 4748}
4638EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
4639 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4640int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4641{ 4759{
4642 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4646,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4646} 4764}
4647EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
4648 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
4772 * This waits for either a completion of a specific task to be signaled or for a
4773 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4649unsigned long __sched 4775unsigned long __sched
4650wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4651 unsigned long timeout) 4777 unsigned long timeout)
@@ -4654,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4654} 4780}
4655EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4656 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4657int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4658{ 4791{
4659 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -4663,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4663} 4796}
4664EXPORT_SYMBOL(wait_for_completion_killable); 4797EXPORT_SYMBOL(wait_for_completion_killable);
4665 4798
4799/**
4800 * try_wait_for_completion - try to decrement a completion without blocking
4801 * @x: completion structure
4802 *
4803 * Returns: 0 if a decrement cannot be done without blocking
4804 * 1 if a decrement succeeded.
4805 *
4806 * If a completion is being used as a counting completion,
4807 * attempt to decrement the counter without blocking. This
4808 * enables us to avoid waiting if the resource the completion
4809 * is protecting is not available.
4810 */
4811bool try_wait_for_completion(struct completion *x)
4812{
4813 int ret = 1;
4814
4815 spin_lock_irq(&x->wait.lock);
4816 if (!x->done)
4817 ret = 0;
4818 else
4819 x->done--;
4820 spin_unlock_irq(&x->wait.lock);
4821 return ret;
4822}
4823EXPORT_SYMBOL(try_wait_for_completion);
4824
4825/**
4826 * completion_done - Test to see if a completion has any waiters
4827 * @x: completion structure
4828 *
4829 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4830 * 1 if there are no waiters.
4831 *
4832 */
4833bool completion_done(struct completion *x)
4834{
4835 int ret = 1;
4836
4837 spin_lock_irq(&x->wait.lock);
4838 if (!x->done)
4839 ret = 0;
4840 spin_unlock_irq(&x->wait.lock);
4841 return ret;
4842}
4843EXPORT_SYMBOL(completion_done);
4844
4666static long __sched 4845static long __sched
4667sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4846sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4668{ 4847{
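
The two helpers added above give non-blocking access to a completion: try_wait_for_completion() consumes one "done" count if available, while completion_done() only peeks. A hedged usage sketch for a completion used as a counting resource (grab_slot and slots are illustrative names; slots is assumed to be pre-loaded with one complete() per free resource):

    static void grab_slot(struct completion *slots)
    {
            /* fast path: take a free slot without sleeping */
            if (try_wait_for_completion(slots))
                    return;

            /* completion_done(slots) could be used here to peek without
             * consuming; otherwise fall back to blocking until complete()
             * releases a slot.
             */
            wait_for_completion(slots);
    }
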
@@ -5010,7 +5189,8 @@ recheck:
5010 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
5011 * assigned. 5190 * assigned.
5012 */ 5191 */
5013 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
5014 return -EPERM; 5194 return -EPERM;
5015#endif 5195#endif
5016 5196
@@ -5734,6 +5914,8 @@ static inline void sched_init_granularity(void)
5734 sysctl_sched_latency = limit; 5914 sysctl_sched_latency = limit;
5735 5915
5736 sysctl_sched_wakeup_granularity *= factor; 5916 sysctl_sched_wakeup_granularity *= factor;
5917
5918 sysctl_sched_shares_ratelimit *= factor;
5737} 5919}
5738 5920
5739#ifdef CONFIG_SMP 5921#ifdef CONFIG_SMP
@@ -5844,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5844 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5845 if (on_rq) { 6027 if (on_rq) {
5846 activate_task(rq_dest, p, 0); 6028 activate_task(rq_dest, p, 0);
5847 check_preempt_curr(rq_dest, p); 6029 check_preempt_curr(rq_dest, p, 0);
5848 } 6030 }
5849done: 6031done:
5850 ret = 1; 6032 ret = 1;
@@ -6169,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
6169static struct ctl_table * 6351static struct ctl_table *
6170sd_alloc_ctl_domain_table(struct sched_domain *sd) 6352sd_alloc_ctl_domain_table(struct sched_domain *sd)
6171{ 6353{
6172 struct ctl_table *table = sd_alloc_ctl_entry(12); 6354 struct ctl_table *table = sd_alloc_ctl_entry(13);
6173 6355
6174 if (table == NULL) 6356 if (table == NULL)
6175 return NULL; 6357 return NULL;
@@ -6197,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6197 sizeof(int), 0644, proc_dointvec_minmax); 6379 sizeof(int), 0644, proc_dointvec_minmax);
6198 set_table_entry(&table[10], "flags", &sd->flags, 6380 set_table_entry(&table[10], "flags", &sd->flags,
6199 sizeof(int), 0644, proc_dointvec_minmax); 6381 sizeof(int), 0644, proc_dointvec_minmax);
6200 /* &table[11] is terminator */ 6382 set_table_entry(&table[11], "name", sd->name,
6383 CORENAME_MAX_SIZE, 0444, proc_dostring);
6384 /* &table[12] is terminator */
6201 6385
6202 return table; 6386 return table;
6203} 6387}
@@ -7081,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7081 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7265 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7082 */ 7266 */
7083 7267
7268#ifdef CONFIG_SCHED_DEBUG
7269# define SD_INIT_NAME(sd, type) sd->name = #type
7270#else
7271# define SD_INIT_NAME(sd, type) do { } while (0)
7272#endif
7273
7084#define SD_INIT(sd, type) sd_init_##type(sd) 7274#define SD_INIT(sd, type) sd_init_##type(sd)
7275
7085#define SD_INIT_FUNC(type) \ 7276#define SD_INIT_FUNC(type) \
7086static noinline void sd_init_##type(struct sched_domain *sd) \ 7277static noinline void sd_init_##type(struct sched_domain *sd) \
7087{ \ 7278{ \
7088 memset(sd, 0, sizeof(*sd)); \ 7279 memset(sd, 0, sizeof(*sd)); \
7089 *sd = SD_##type##_INIT; \ 7280 *sd = SD_##type##_INIT; \
7090 sd->level = SD_LV_##type; \ 7281 sd->level = SD_LV_##type; \
7282 SD_INIT_NAME(sd, type); \
7091} 7283}
7092 7284
7093SD_INIT_FUNC(CPU) 7285SD_INIT_FUNC(CPU)
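
SD_INIT_NAME relies on the preprocessor stringizing operator, so with CONFIG_SCHED_DEBUG each sched_domain gets a human-readable name, which the new read-only "name" ctl_table entry added earlier in this diff can expose. Roughly, SD_INIT_FUNC(CPU) then expands to the following (expansion written out here for illustration):

    static noinline void sd_init_CPU(struct sched_domain *sd)
    {
            memset(sd, 0, sizeof(*sd));
            *sd = SD_CPU_INIT;
            sd->level = SD_LV_CPU;
            sd->name = "CPU";   /* from SD_INIT_NAME(sd, CPU): #type -> "CPU" */
    }
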
@@ -7583,24 +7775,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7583 * and partition_sched_domains() will fallback to the single partition 7775 * and partition_sched_domains() will fallback to the single partition
7584 * 'fallback_doms', it also forces the domains to be rebuilt. 7776 * 'fallback_doms', it also forces the domains to be rebuilt.
7585 * 7777 *
7778 * If doms_new==NULL it will be replaced with cpu_online_map.
7779 * ndoms_new==0 is a special case for destroying existing domains.
7780 * It will not create the default domain.
7781 *
7586 * Call with hotplug lock held 7782 * Call with hotplug lock held
7587 */ 7783 */
7588void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7784void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7589 struct sched_domain_attr *dattr_new) 7785 struct sched_domain_attr *dattr_new)
7590{ 7786{
7591 int i, j; 7787 int i, j, n;
7592 7788
7593 mutex_lock(&sched_domains_mutex); 7789 mutex_lock(&sched_domains_mutex);
7594 7790
7595 /* always unregister in case we don't destroy any domains */ 7791 /* always unregister in case we don't destroy any domains */
7596 unregister_sched_domain_sysctl(); 7792 unregister_sched_domain_sysctl();
7597 7793
7598 if (doms_new == NULL) 7794 n = doms_new ? ndoms_new : 0;
7599 ndoms_new = 0;
7600 7795
7601 /* Destroy deleted domains */ 7796 /* Destroy deleted domains */
7602 for (i = 0; i < ndoms_cur; i++) { 7797 for (i = 0; i < ndoms_cur; i++) {
7603 for (j = 0; j < ndoms_new; j++) { 7798 for (j = 0; j < n; j++) {
7604 if (cpus_equal(doms_cur[i], doms_new[j]) 7799 if (cpus_equal(doms_cur[i], doms_new[j])
7605 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7800 && dattrs_equal(dattr_cur, i, dattr_new, j))
7606 goto match1; 7801 goto match1;
@@ -7613,7 +7808,6 @@ match1:
7613 7808
7614 if (doms_new == NULL) { 7809 if (doms_new == NULL) {
7615 ndoms_cur = 0; 7810 ndoms_cur = 0;
7616 ndoms_new = 1;
7617 doms_new = &fallback_doms; 7811 doms_new = &fallback_doms;
7618 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7812 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7619 dattr_new = NULL; 7813 dattr_new = NULL;
@@ -7650,8 +7844,13 @@ match2:
7650int arch_reinit_sched_domains(void) 7844int arch_reinit_sched_domains(void)
7651{ 7845{
7652 get_online_cpus(); 7846 get_online_cpus();
7847
7848 /* Destroy domains first to force the rebuild */
7849 partition_sched_domains(0, NULL, NULL);
7850
7653 rebuild_sched_domains(); 7851 rebuild_sched_domains();
7654 put_online_cpus(); 7852 put_online_cpus();
7853
7655 return 0; 7854 return 0;
7656} 7855}
7657 7856
@@ -7735,7 +7934,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7735 case CPU_ONLINE_FROZEN: 7934 case CPU_ONLINE_FROZEN:
7736 case CPU_DEAD: 7935 case CPU_DEAD:
7737 case CPU_DEAD_FROZEN: 7936 case CPU_DEAD_FROZEN:
7738 partition_sched_domains(0, NULL, NULL); 7937 partition_sched_domains(1, NULL, NULL);
7739 return NOTIFY_OK; 7938 return NOTIFY_OK;
7740 7939
7741 default: 7940 default:
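
After these hunks the ndoms_new/doms_new pair encodes three cases, as the new comment spells out: a NULL doms_new means "use the default online map", while ndoms_new == 0 means "destroy everything and build no fallback". That is why arch_reinit_sched_domains() now calls the function twice and the hotplug notifier switched from 0 to 1. A hedged summary of the call patterns visible in this diff:

    /* Destroy all current domains, build nothing (used to force a rebuild): */
    partition_sched_domains(0, NULL, NULL);

    /* Rebuild the single default domain from cpu_online_map minus the
     * isolated CPUs (what the CPU hotplug notifier now requests):
     */
    partition_sched_domains(1, NULL, NULL);
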
@@ -8000,7 +8199,6 @@ void __init sched_init(void)
8000 8199
8001 rq = cpu_rq(i); 8200 rq = cpu_rq(i);
8002 spin_lock_init(&rq->lock); 8201 spin_lock_init(&rq->lock);
8003 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8004 rq->nr_running = 0; 8202 rq->nr_running = 0;
8005 init_cfs_rq(&rq->cfs, rq); 8203 init_cfs_rq(&rq->cfs, rq);
8006 init_rt_rq(&rq->rt, rq); 8204 init_rt_rq(&rq->rt, rq);
@@ -8123,20 +8321,25 @@ void __might_sleep(char *file, int line)
8123#ifdef in_atomic 8321#ifdef in_atomic
8124 static unsigned long prev_jiffy; /* ratelimiting */ 8322 static unsigned long prev_jiffy; /* ratelimiting */
8125 8323
8126 if ((in_atomic() || irqs_disabled()) && 8324 if ((!in_atomic() && !irqs_disabled()) ||
8127 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8325 system_state != SYSTEM_RUNNING || oops_in_progress)
8128 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8326 return;
8129 return; 8327 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8130 prev_jiffy = jiffies; 8328 return;
8131 printk(KERN_ERR "BUG: sleeping function called from invalid" 8329 prev_jiffy = jiffies;
8132 " context at %s:%d\n", file, line); 8330
8133 printk("in_atomic():%d, irqs_disabled():%d\n", 8331 printk(KERN_ERR
8134 in_atomic(), irqs_disabled()); 8332 "BUG: sleeping function called from invalid context at %s:%d\n",
8135 debug_show_held_locks(current); 8333 file, line);
8136 if (irqs_disabled()) 8334 printk(KERN_ERR
8137 print_irqtrace_events(current); 8335 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8138 dump_stack(); 8336 in_atomic(), irqs_disabled(),
8139 } 8337 current->pid, current->comm);
8338
8339 debug_show_held_locks(current);
8340 if (irqs_disabled())
8341 print_irqtrace_events(current);
8342 dump_stack();
8140#endif 8343#endif
8141} 8344}
8142EXPORT_SYMBOL(__might_sleep); 8345EXPORT_SYMBOL(__might_sleep);
@@ -8457,8 +8660,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8457 WARN_ON(!parent); /* root should already exist */ 8660 WARN_ON(!parent); /* root should already exist */
8458 8661
8459 tg->parent = parent; 8662 tg->parent = parent;
8460 list_add_rcu(&tg->siblings, &parent->children);
8461 INIT_LIST_HEAD(&tg->children); 8663 INIT_LIST_HEAD(&tg->children);
8664 list_add_rcu(&tg->siblings, &parent->children);
8462 spin_unlock_irqrestore(&task_group_lock, flags); 8665 spin_unlock_irqrestore(&task_group_lock, flags);
8463 8666
8464 return tg; 8667 return tg;
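
Swapping the two lines above follows the usual RCU publish rule: the new group's children list is initialized before the group is linked into its parent's RCU-protected list, so a reader walking parent->children under rcu_read_lock() can never observe an uninitialized tg->children. The pattern in isolation:

    tg->parent = parent;
    INIT_LIST_HEAD(&tg->children);                  /* initialize first ...    */
    list_add_rcu(&tg->siblings, &parent->children); /* ... then publish (RCU)  */
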
@@ -8634,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8634static unsigned long to_ratio(u64 period, u64 runtime) 8837static unsigned long to_ratio(u64 period, u64 runtime)
8635{ 8838{
8636 if (runtime == RUNTIME_INF) 8839 if (runtime == RUNTIME_INF)
8637 return 1ULL << 16; 8840 return 1ULL << 20;
8638 8841
8639 return div64_u64(runtime << 16, period); 8842 return div64_u64(runtime << 20, period);
8640} 8843}
8641 8844
8642#ifdef CONFIG_CGROUP_SCHED 8845/* Must be called with tasklist_lock held */
8643static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8846static inline int tg_has_rt_tasks(struct task_group *tg)
8644{ 8847{
8645 struct task_group *tgi, *parent = tg->parent; 8848 struct task_struct *g, *p;
8646 unsigned long total = 0;
8647 8849
8648 if (!parent) { 8850 do_each_thread(g, p) {
8649 if (global_rt_period() < period) 8851 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8650 return 0; 8852 return 1;
8853 } while_each_thread(g, p);
8651 8854
8652 return to_ratio(period, runtime) < 8855 return 0;
8653 to_ratio(global_rt_period(), global_rt_runtime()); 8856}
8654 }
8655 8857
8656 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8858struct rt_schedulable_data {
8657 return 0; 8859 struct task_group *tg;
8860 u64 rt_period;
8861 u64 rt_runtime;
8862};
8658 8863
8659 rcu_read_lock(); 8864static int tg_schedulable(struct task_group *tg, void *data)
8660 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8865{
8661 if (tgi == tg) 8866 struct rt_schedulable_data *d = data;
8662 continue; 8867 struct task_group *child;
8868 unsigned long total, sum = 0;
8869 u64 period, runtime;
8870
8871 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8872 runtime = tg->rt_bandwidth.rt_runtime;
8663 8873
8664 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8874 if (tg == d->tg) {
8665 tgi->rt_bandwidth.rt_runtime); 8875 period = d->rt_period;
8876 runtime = d->rt_runtime;
8666 } 8877 }
8667 rcu_read_unlock();
8668 8878
8669 return total + to_ratio(period, runtime) <= 8879 /*
8670 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8880 * Cannot have more runtime than the period.
8671 parent->rt_bandwidth.rt_runtime); 8881 */
8672} 8882 if (runtime > period && runtime != RUNTIME_INF)
8673#elif defined CONFIG_USER_SCHED 8883 return -EINVAL;
8674static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8675{
8676 struct task_group *tgi;
8677 unsigned long total = 0;
8678 unsigned long global_ratio =
8679 to_ratio(global_rt_period(), global_rt_runtime());
8680 8884
8681 rcu_read_lock(); 8885 /*
8682 list_for_each_entry_rcu(tgi, &task_groups, list) { 8886 * Ensure we don't starve existing RT tasks.
8683 if (tgi == tg) 8887 */
8684 continue; 8888 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8889 return -EBUSY;
8685 8890
8686 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8891 total = to_ratio(period, runtime);
8687 tgi->rt_bandwidth.rt_runtime); 8892
8893 /*
8894 * Nobody can have more than the global setting allows.
8895 */
8896 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8897 return -EINVAL;
8898
8899 /*
8900 * The sum of our children's runtime should not exceed our own.
8901 */
8902 list_for_each_entry_rcu(child, &tg->children, siblings) {
8903 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8904 runtime = child->rt_bandwidth.rt_runtime;
8905
8906 if (child == d->tg) {
8907 period = d->rt_period;
8908 runtime = d->rt_runtime;
8909 }
8910
8911 sum += to_ratio(period, runtime);
8688 } 8912 }
8689 rcu_read_unlock();
8690 8913
8691 return total + to_ratio(period, runtime) < global_ratio; 8914 if (sum > total)
8915 return -EINVAL;
8916
8917 return 0;
8692} 8918}
8693#endif
8694 8919
8695/* Must be called with tasklist_lock held */ 8920static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8696static inline int tg_has_rt_tasks(struct task_group *tg)
8697{ 8921{
8698 struct task_struct *g, *p; 8922 struct rt_schedulable_data data = {
8699 do_each_thread(g, p) { 8923 .tg = tg,
8700 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8924 .rt_period = period,
8701 return 1; 8925 .rt_runtime = runtime,
8702 } while_each_thread(g, p); 8926 };
8703 return 0; 8927
8928 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8704} 8929}
8705 8930
8706static int tg_set_bandwidth(struct task_group *tg, 8931static int tg_set_bandwidth(struct task_group *tg,
@@ -8710,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
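
to_ratio() now returns a 20-bit fixed-point fraction runtime/period (1ULL << 20 meaning "all of the period"), giving the hierarchy checks more resolution than the previous 16-bit shift. tg_schedulable() then enforces, per group, that runtime does not exceed the period, that no group exceeds the global ratio, and that the children's ratios sum to no more than the parent's. A worked example using the scheduler's default global limits (950 ms runtime in a 1 s period; any consistent time unit works, since the ratio is dimensionless):

    /* to_ratio(period, runtime) == (runtime << 20) / period, so:
     *
     *   period  = 1000000 us, runtime = 950000 us
     *   ratio   = (950000 << 20) / 1000000 = 996147   (~0.95 * 2^20)
     *
     * Two child groups each asking for 500000 us in the same period would
     * sum to 2 * 524288 = 1048576 > 996147, so tg_schedulable() rejects the
     * configuration with -EINVAL.
     */
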
8710 8935
8711 mutex_lock(&rt_constraints_mutex); 8936 mutex_lock(&rt_constraints_mutex);
8712 read_lock(&tasklist_lock); 8937 read_lock(&tasklist_lock);
8713 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8938 err = __rt_schedulable(tg, rt_period, rt_runtime);
8714 err = -EBUSY; 8939 if (err)
8715 goto unlock; 8940 goto unlock;
8716 }
8717 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8718 err = -EINVAL;
8719 goto unlock;
8720 }
8721 8941
8722 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8942 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8723 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8943 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8786,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
8786 9006
8787static int sched_rt_global_constraints(void) 9007static int sched_rt_global_constraints(void)
8788{ 9008{
8789 struct task_group *tg = &root_task_group; 9009 u64 runtime, period;
8790 u64 rt_runtime, rt_period;
8791 int ret = 0; 9010 int ret = 0;
8792 9011
8793 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9012 if (sysctl_sched_rt_period <= 0)
8794 rt_runtime = tg->rt_bandwidth.rt_runtime; 9013 return -EINVAL;
9014
9015 runtime = global_rt_runtime();
9016 period = global_rt_period();
9017
9018 /*
9019 * Sanity check on the sysctl variables.
9020 */
9021 if (runtime > period && runtime != RUNTIME_INF)
9022 return -EINVAL;
8795 9023
8796 mutex_lock(&rt_constraints_mutex); 9024 mutex_lock(&rt_constraints_mutex);
8797 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9025 read_lock(&tasklist_lock);
8798 ret = -EINVAL; 9026 ret = __rt_schedulable(NULL, 0, 0);
9027 read_unlock(&tasklist_lock);
8799 mutex_unlock(&rt_constraints_mutex); 9028 mutex_unlock(&rt_constraints_mutex);
8800 9029
8801 return ret; 9030 return ret;
@@ -8806,6 +9035,9 @@ static int sched_rt_global_constraints(void)
8806 unsigned long flags; 9035 unsigned long flags;
8807 int i; 9036 int i;
8808 9037
9038 if (sysctl_sched_rt_period <= 0)
9039 return -EINVAL;
9040
8809 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9041 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8810 for_each_possible_cpu(i) { 9042 for_each_possible_cpu(i) {
8811 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9043 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8866,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8866 9098
8867 if (!cgrp->parent) { 9099 if (!cgrp->parent) {
8868 /* This is early initialization for the top cgroup */ 9100 /* This is early initialization for the top cgroup */
8869 init_task_group.css.cgroup = cgrp;
8870 return &init_task_group.css; 9101 return &init_task_group.css;
8871 } 9102 }
8872 9103
@@ -8875,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8875 if (IS_ERR(tg)) 9106 if (IS_ERR(tg))
8876 return ERR_PTR(-ENOMEM); 9107 return ERR_PTR(-ENOMEM);
8877 9108
8878 /* Bind the cgroup to task_group object we just created */
8879 tg->css.cgroup = cgrp;
8880
8881 return &tg->css; 9109 return &tg->css;
8882} 9110}
8883 9111