author		Arjan van de Ven <arjan@linux.intel.com>	2008-10-17 12:20:26 -0400
committer	Arjan van de Ven <arjan@linux.intel.com>	2008-10-17 12:20:26 -0400
commit		651dab4264e4ba0e563f5ff56f748127246e9065 (patch)
tree		016630974bdcb00fe529b673f96d389e0fd6dc94 /kernel/sched.c
parent		40b8606253552109815786e5d4b0de98782d31f5 (diff)
parent		2e532d68a2b3e2aa6b19731501222069735c741c (diff)

Merge commit 'linus/master' into merge-linus

Conflicts:
	arch/x86/kvm/i8254.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	408
1 files changed, 262 insertions(+), 146 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index e46b5afa200d..eb3c72953615 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
 }
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -297,9 +302,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -603,9 +608,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -1086,7 +1091,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_DONE;
 }
 
-static void init_hrtick(void)
+static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
@@ -1101,7 +1106,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1118,9 +1123,9 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1132,7 +1137,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif	/* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1379,38 +1384,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1418,15 +1409,43 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1485,11 +1504,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1514,6 +1533,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+	return 0;
 }
 
 /*
@@ -1521,10 +1542,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1535,11 +1556,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1549,7 +1567,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1560,9 +1578,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
@@ -1920,11 +1938,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 	running = task_running(rq, p);
 	on_rq = p->se.on_rq;
 	ncsw = 0;
-	if (!match_state || p->state == match_state) {
-		ncsw = p->nivcsw + p->nvcsw;
-		if (unlikely(!ncsw))
-			ncsw = 1;
-	}
+	if (!match_state || p->state == match_state)
+		ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 	task_rq_unlock(rq, &flags);
 
 	/*
@@ -2284,7 +2299,7 @@ out_running:
 	trace_mark(kernel_sched_wakeup,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2419,7 +2434,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	trace_mark(kernel_sched_wakeup_new,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2879,7 +2894,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -4626,6 +4641,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4637,6 +4661,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4657,10 +4687,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
@@ -4688,12 +4715,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4701,6 +4747,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4710,6 +4763,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4718,6 +4779,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -5120,7 +5188,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (rt_bandwidth_enabled() && rt_policy(policy) &&
+			task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -5956,7 +6025,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
@@ -6281,7 +6350,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(12);
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
 
 	if (table == NULL)
 		return NULL;
@@ -6309,7 +6378,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	/* &table[11] is terminator */
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
 
 	return table;
 }
@@ -7193,13 +7264,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)		sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)		do { } while (0)
+#endif
+
 #define	SD_INIT(sd, type)	sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type)	\
 static noinline void sd_init_##type(struct sched_domain *sd)	\
 {	\
 	memset(sd, 0, sizeof(*sd));	\
 	*sd = SD_##type##_INIT;		\
 	sd->level = SD_LV_##type;	\
+	SD_INIT_NAME(sd, type);		\
 }
 
 SD_INIT_FUNC(CPU)
@@ -7695,24 +7774,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms', it also forces the domains to be rebuilt.
  *
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
-	int i, j;
+	int i, j, n;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL)
-		ndoms_new = 0;
+	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < ndoms_new; j++) {
+		for (j = 0; j < n; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7725,7 +7807,6 @@ match1:
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		ndoms_new = 1;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 		dattr_new = NULL;
@@ -7762,8 +7843,13 @@ match2:
 int arch_reinit_sched_domains(void)
 {
 	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
 	rebuild_sched_domains();
 	put_online_cpus();
+
 	return 0;
 }
 
@@ -7847,7 +7933,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		partition_sched_domains(0, NULL, NULL);
+		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;
 
 	default:
@@ -8234,20 +8320,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8745,73 +8836,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	total = to_ratio(period, runtime);
+
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
@@ -8821,14 +8934,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
-		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8897,16 +9005,25 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(NULL, 0, 0);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;
@@ -8917,6 +9034,9 @@ static int sched_rt_global_constraints(void)
 	unsigned long flags;
 	int i;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8977,7 +9097,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
@@ -8986,9 +9105,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cgrp;
-
 	return &tg->css;
 }
 