path: root/kernel/sched.c
author		Haavard Skinnemoen <haavard.skinnemoen@atmel.com>	2008-10-23 09:24:10 -0400
committer	Haavard Skinnemoen <haavard.skinnemoen@atmel.com>	2008-10-23 09:24:10 -0400
commit		d9214556b11a8d18ff588e60824c12041d30f791 (patch)
tree		04ab59d13961675811a55c96fb12b2b167b72318 /kernel/sched.c
parent		72a1419a9d4c859a3345e4b83f8ef7d599d3818c (diff)
parent		e82c6106b04b85879d802bbbeaed30d9b10a92e2 (diff)
Merge branches 'boards' and 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/hskinnemoen/avr32-2.6
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	413
1 files changed, 259 insertions, 154 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962dc0aa2..d906f72b42d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -204,11 +205,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -604,9 +610,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -1102,7 +1108,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1121,7 +1127,7 @@ static void init_rq_hrtick(struct rq *rq)
 	rq->hrtick_timer.function = hrtick;
 	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1139,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1386,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1419,14 +1411,42 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
 }
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
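
The two hunks above convert walk_tg_tree() to a generic visitor interface: both callbacks now take an opaque void *data, return an int, and a non-zero return aborts the walk. The following is an illustrative userspace sketch of that pattern, not kernel code: the node layout and names are invented, and it recurses where the kernel version iterates under rcu_read_lock().

#include <stdio.h>

struct node {
	const char *name;
	struct node *child;		/* first child */
	struct node *sibling;		/* next sibling */
};

typedef int (*visitor)(struct node *n, void *data);

/* Call @down on entry and @up on exit; a non-zero return aborts the walk. */
static int walk_tree(struct node *n, visitor down, visitor up, void *data)
{
	struct node *c;
	int ret;

	ret = down(n, data);
	if (ret)
		return ret;
	for (c = n->child; c; c = c->sibling) {
		ret = walk_tree(c, down, up, data);
		if (ret)
			return ret;
	}
	return up(n, data);
}

static int print_enter(struct node *n, void *data)
{
	(void)data;
	printf("enter %s\n", n->name);
	return 0;
}

static int visit_nop(struct node *n, void *data)
{
	(void)n;
	(void)data;
	return 0;
}

int main(void)
{
	struct node leaf = { "leaf", NULL, NULL };
	struct node root = { "root", &leaf, NULL };

	return walk_tree(&root, print_enter, visit_nop, NULL);
}
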
@@ -1486,11 +1506,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1535,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+	return 0;
 }
 
 /*
@@ -1522,10 +1544,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1558,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1569,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1561,9 +1580,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
@@ -1918,14 +1937,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 	 * just go back and repeat.
 	 */
 	rq = task_rq_lock(p, &flags);
+	trace_sched_wait_task(rq, p);
 	running = task_running(rq, p);
 	on_rq = p->se.on_rq;
 	ncsw = 0;
-	if (!match_state || p->state == match_state) {
-		ncsw = p->nivcsw + p->nvcsw;
-		if (unlikely(!ncsw))
-			ncsw = 1;
-	}
+	if (!match_state || p->state == match_state)
+		ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 	task_rq_unlock(rq, &flags);
 
 	/*
@@ -2282,10 +2299,8 @@ out_activate:
 	success = 1;
 
 out_running:
-	trace_mark(kernel_sched_wakeup,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	trace_sched_wakeup(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,10 +2432,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	trace_mark(kernel_sched_wakeup_new,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	trace_sched_wakeup_new(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2592,11 +2605,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_mark(kernel_sched_schedule,
-		"prev_pid %d next_pid %d prev_state %ld "
-		"## rq %p prev %p next %p",
-		prev->pid, next->pid, prev->state,
-		rq, prev, next);
+	trace_sched_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -2836,6 +2845,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
+	trace_sched_migrate_task(rq, p, dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -2880,7 +2890,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -4037,23 +4047,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
-	u64 ns, delta_exec;
 	struct rq *rq;
+	u64 ns = 0;
 
 	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime;
+
 	if (task_current(rq, p)) {
+		u64 delta_exec;
+
 		update_rq_clock(rq);
 		delta_exec = rq->clock - p->se.exec_start;
 		if ((s64)delta_exec > 0)
-			ns += delta_exec;
+			ns = delta_exec;
 	}
+
 	task_rq_unlock(rq, &flags);
 
 	return ns;
@@ -4070,6 +4083,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 	cputime64_t tmp;
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4108,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
 	tmp = cputime_to_cputime64(cputime);
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4144,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	}
 
 	p->stime = cputime_add(p->stime, cputime);
+	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4186,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
 	if (p == rq->idle) {
 		p->stime = cputime_add(p->stime, steal);
+		account_group_system_time(p, steal);
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
@@ -4627,6 +4644,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4638,6 +4664,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4658,10 +4690,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
@@ -4689,12 +4718,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4702,6 +4750,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4766,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4719,6 +4782,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
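
The kerneldoc added above documents the completion API (complete(), complete_all(), and the wait_for_completion*() family). Below is a minimal usage sketch; it is kernel-module-style code that only builds against kernel headers, and the demo_* names are invented for illustration.

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_COMPLETION(demo_done);

static int demo_thread(void *unused)
{
	msleep(10);			/* stand-in for real setup work */
	complete(&demo_done);		/* wakes one waiter, in queue order */
	return 0;
}

/* Returns 0 once demo_thread signals completion, -ETIMEDOUT otherwise. */
static int demo_wait(void)
{
	kthread_run(demo_thread, NULL, "demo_thread");

	/* wait_for_completion_timeout() returns 0 on timeout */
	if (!wait_for_completion_timeout(&demo_done, msecs_to_jiffies(100)))
		return -ETIMEDOUT;

	return 0;
}
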
@@ -5121,7 +5191,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (rt_bandwidth_enabled() && rt_policy(policy) &&
+			task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -5957,7 +6028,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
@@ -6282,7 +6353,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(12);
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
 
 	if (table == NULL)
 		return NULL;
@@ -6310,7 +6381,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	/* &table[11] is terminator */
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
 
 	return table;
 }
@@ -7194,13 +7267,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type) sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type) do { } while (0)
+#endif
+
 #define SD_INIT(sd, type) sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type) \
 static noinline void sd_init_##type(struct sched_domain *sd) \
 { \
 	memset(sd, 0, sizeof(*sd)); \
 	*sd = SD_##type##_INIT; \
 	sd->level = SD_LV_##type; \
+	SD_INIT_NAME(sd, type); \
 }
 
 SD_INIT_FUNC(CPU)
@@ -8242,20 +8323,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-			system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-				in_atomic(), irqs_disabled());
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8839,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
+
+	total = to_ratio(period, runtime);
+
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
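
For reference, the new tg_schedulable() works on fixed-point ratios: to_ratio() now scales runtime/period by 2^20 rather than 2^16, and the sum of the children's ratios must stay within the group's own. A small userspace sketch of that arithmetic (simplified; RUNTIME_INF handling omitted), with made-up example numbers:

#include <stdio.h>
#include <stdint.h>

/* Same fixed-point scaling as the patched to_ratio(), minus RUNTIME_INF. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t parent = to_ratio(1000000000ULL, 950000000ULL);	/* 0.95 */
	uint64_t child1 = to_ratio(1000000000ULL, 400000000ULL);	/* 0.40 */
	uint64_t child2 = to_ratio(500000000ULL, 300000000ULL);		/* 0.60 */

	/* 0.40 + 0.60 > 0.95, so this configuration would be rejected. */
	printf("children %s parent budget\n",
	       child1 + child2 > parent ? "exceed" : "fit within");

	return 0;
}
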
@@ -8829,14 +8937,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
-		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +9008,25 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(NULL, 0, 0);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;
@@ -8991,7 +9100,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
@@ -9000,9 +9108,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cgrp;
-
 	return &tg->css;
 }
 