Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  359
1 file changed, 228 insertions(+), 131 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962dc0aa2..2caedc47e764 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -604,9 +609,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq)
 	rq->hrtick_timer.function = hrtick;
 	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1419,15 +1410,43 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
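For context, a minimal sketch (not part of this patch) of how the reworked walk_tg_tree() above is used: a visitor now takes an opaque void *data argument and returns 0 to keep walking or non-zero to abort. The tg_count_groups() and count_task_groups() helpers below are hypothetical.

/* Hypothetical visitor: counts the task groups in the tree. */
static int tg_count_groups(struct task_group *tg, void *data)
{
	(*(unsigned long *)data)++;
	return 0;		/* keep walking; non-zero would abort the walk */
}

static unsigned long count_task_groups(void)
{
	unsigned long nr = 0;

	/* tg_nop is the no-op "up" visitor introduced by this patch. */
	walk_tg_tree(tg_count_groups, tg_nop, &nr);
	return nr;
}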
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+	return 0;
 }
 
 /*
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 	running = task_running(rq, p);
 	on_rq = p->se.on_rq;
 	ncsw = 0;
-	if (!match_state || p->state == match_state) {
-		ncsw = p->nivcsw + p->nvcsw;
-		if (unlikely(!ncsw))
-			ncsw = 1;
-	}
+	if (!match_state || p->state == match_state)
+		ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 	task_rq_unlock(rq, &flags);
 
 	/*
@@ -2285,7 +2300,7 @@ out_running:
 	trace_mark(kernel_sched_wakeup,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	trace_mark(kernel_sched_wakeup_new,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4638,6 +4662,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
@@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
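As a rough usage sketch of the completion API documented above (not part of this patch; setup_thread(), wait_for_setup() and the 500 ms timeout are made-up examples), one thread signals with complete() while another blocks in wait_for_completion_timeout():

/* Needs <linux/completion.h>, <linux/kthread.h>, <linux/jiffies.h>, <linux/errno.h>. */
static DECLARE_COMPLETION(setup_done);

/* Hypothetical worker: runs in its own kthread and signals completion when done. */
static int setup_thread(void *unused)
{
	/* ... perform the setup work ... */
	complete(&setup_done);		/* wakes exactly one waiter, in queue order */
	return 0;
}

static int wait_for_setup(void)
{
	kthread_run(setup_thread, NULL, "setup");

	/* wait_for_completion_timeout() returns 0 if the timeout expired */
	if (!wait_for_completion_timeout(&setup_done, msecs_to_jiffies(500)))
		return -ETIMEDOUT;

	return 0;
}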
@@ -5121,7 +5189,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (rt_bandwidth_enabled() && rt_policy(policy) &&
+			task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
@@ -8242,20 +8311,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8827,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
+
+	total = to_ratio(period, runtime);
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
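A worked example of the to_ratio() precision bump above (illustrative arithmetic, assuming the default 1 s period): moving the fixed-point scale from 2^16 to 2^20 keeps very small runtimes from rounding down to a zero ratio.

/* 10 us of runtime in a 1 s (1,000,000,000 ns) period: */
/*   old: (10000ULL << 16) / 1000000000  ==  0   (ratio lost to rounding)  */
/*   new: (10000ULL << 20) / 1000000000  == 10   (still a non-zero ratio)  */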
@@ -8829,14 +8925,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
-		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +8996,25 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(NULL, 0, 0);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;