Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    519
1 file changed, 320 insertions(+), 199 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 13dd2db9fb2d..558e5f284269 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
@@ -71,6 +72,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <trace/sched.h>

 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -201,14 +203,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-        rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+        return sysctl_sched_rt_runtime >= 0;
 }

 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
         ktime_t now;

-        if (rt_b->rt_runtime == RUNTIME_INF)
+        if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                 return;

         if (hrtimer_active(&rt_b->rt_period_timer))
@@ -221,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)

                 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-                hrtimer_start(&rt_b->rt_period_timer,
-                              rt_b->rt_period_timer.expires,
-                              HRTIMER_MODE_ABS);
+                hrtimer_start_expires(&rt_b->rt_period_timer,
+                                HRTIMER_MODE_ABS);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */

 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -380,7 +386,6 @@ struct cfs_rq {

         u64 exec_clock;
         u64 min_vruntime;
-        u64 pair_start;

         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
@@ -392,9 +397,9 @@ struct cfs_rq {
          * 'curr' points to currently running entity on this cfs_rq.
          * It is set to NULL otherwise (i.e when none are currently running).
          */
-        struct sched_entity *curr, *next;
+        struct sched_entity *curr, *next, *last;

-        unsigned long nr_spread_over;
+        unsigned int nr_spread_over;

 #ifdef CONFIG_FAIR_GROUP_SCHED
         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
@@ -604,9 +609,9 @@ struct rq {

 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-        rq->curr->sched_class->check_preempt_curr(rq, p);
+        rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }

 static inline int cpu_of(struct rq *rq)
@@ -813,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 unsigned int sysctl_sched_shares_ratelimit = 250000;

 /*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
@@ -957,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
         }
 }

+void task_rq_unlock_wait(struct task_struct *p)
+{
+        struct rq *rq = task_rq(p);
+
+        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+        spin_unlock_wait(&rq->lock);
+}
+
 static void __task_rq_unlock(struct rq *rq)
         __releases(rq->lock)
 {
@@ -1058,7 +1078,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
         struct hrtimer *timer = &rq->hrtick_timer;
         ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

-        timer->expires = time;
+        hrtimer_set_expires(timer, time);

         if (rq == this_rq()) {
                 hrtimer_restart(timer);
@@ -1102,7 +1122,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }

-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1119,9 +1139,9 @@ static void init_rq_hrtick(struct rq *rq)

         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1153,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_HRTICK */

 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1400,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
         update_load_sub(&rq->load, load);
 }

-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-        struct rq *rq = cpu_rq(cpu);
-
-        if (rq->nr_running)
-                rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-        return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);

 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
         struct task_group *parent, *child;
+        int ret;

         rcu_read_lock();
         parent = &root_task_group;
 down:
-        (*down)(parent, cpu, sd);
+        ret = (*down)(parent, data);
+        if (ret)
+                goto out_unlock;
         list_for_each_entry_rcu(child, &parent->children, siblings) {
                 parent = child;
                 goto down;
@@ -1419,23 +1425,53 @@ down:
 up:
                 continue;
         }
-        (*up)(parent, cpu, sd);
+        ret = (*up)(parent, data);
+        if (ret)
+                goto out_unlock;

         child = parent;
         parent = parent->parent;
         if (parent)
                 goto up;
+out_unlock:
         rcu_read_unlock();
+
+        return ret;
 }

+static int tg_nop(struct task_group *tg, void *data)
+{
+        return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+
+        if (rq->nr_running)
+                rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+        else
+                rq->avg_load_per_task = 0;
+
+        return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);

 /*
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
+update_group_shares_cpu(struct task_group *tg, int cpu,
                         unsigned long sd_shares, unsigned long sd_rq_weight)
 {
         int boost = 0;
         unsigned long shares;
@@ -1466,19 +1502,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
          *
          */
         shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);

-        /*
-         * record the actual number of shares, not the boosted amount.
-         */
-        tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-        tg->cfs_rq[cpu]->rq_weight = rq_weight;
+        if (abs(shares - tg->se[cpu]->load.weight) >
+                        sysctl_sched_shares_thresh) {
+                struct rq *rq = cpu_rq(cpu);
+                unsigned long flags;

-        if (shares < MIN_SHARES)
-                shares = MIN_SHARES;
-        else if (shares > MAX_SHARES)
-                shares = MAX_SHARES;
+                spin_lock_irqsave(&rq->lock, flags);
+                /*
+                 * record the actual number of shares, not the boosted amount.
+                 */
+                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+                tg->cfs_rq[cpu]->rq_weight = rq_weight;

                 __set_se_shares(tg->se[cpu], shares);
+                spin_unlock_irqrestore(&rq->lock, flags);
+        }
 }

 /*
@@ -1486,11 +1526,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
         unsigned long rq_weight = 0;
         unsigned long shares = 0;
+        struct sched_domain *sd = data;
         int i;

         for_each_cpu_mask(i, sd->span) {
@@ -1507,14 +1547,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
         if (!rq_weight)
                 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;

-        for_each_cpu_mask(i, sd->span) {
-                struct rq *rq = cpu_rq(i);
-                unsigned long flags;
+        for_each_cpu_mask(i, sd->span)
+                update_group_shares_cpu(tg, i, shares, rq_weight);

-                spin_lock_irqsave(&rq->lock, flags);
-                __update_group_shares_cpu(tg, i, shares, rq_weight);
-                spin_unlock_irqrestore(&rq->lock, flags);
-        }
+        return 0;
 }

 /*
@@ -1522,10 +1558,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
         unsigned long load;
+        long cpu = (long)data;

         if (!tg->parent) {
                 load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1572,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
         }

         tg->cfs_rq[cpu]->h_load = load;
-}

-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+        return 0;
 }

 static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1583,7 @@ static void update_shares(struct sched_domain *sd)

         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                 sd->last_update = now;
-                walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+                walk_tg_tree(tg_nop, tg_shares_up, sd);
         }
 }

@@ -1561,9 +1594,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
         spin_lock(&rq->lock);
 }

-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-        walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }

 #else
@@ -1782,7 +1815,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         /*
          * Buddy candidates are cache hot:
          */
-        if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+        if (sched_feat(CACHE_HOT_BUDDY) &&
+                        (&p->se == cfs_rq_of(&p->se)->next ||
+                         &p->se == cfs_rq_of(&p->se)->last))
                 return 1;

         if (p->sched_class != &fair_sched_class)
@@ -1918,14 +1953,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
          * just go back and repeat.
          */
         rq = task_rq_lock(p, &flags);
+        trace_sched_wait_task(rq, p);
         running = task_running(rq, p);
         on_rq = p->se.on_rq;
         ncsw = 0;
-        if (!match_state || p->state == match_state) {
-                ncsw = p->nivcsw + p->nvcsw;
-                if (unlikely(!ncsw))
-                        ncsw = 1;
-        }
+        if (!match_state || p->state == match_state)
+                ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
         task_rq_unlock(rq, &flags);

         /*
@@ -2282,10 +2315,8 @@ out_activate:
         success = 1;

 out_running:
-        trace_mark(kernel_sched_wakeup,
-                "pid %d state %ld ## rq %p task %p rq->curr %p",
-                p->pid, p->state, rq, p, rq->curr);
-        check_preempt_curr(rq, p);
+        trace_sched_wakeup(rq, p);
+        check_preempt_curr(rq, p, sync);

         p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,10 +2448,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-        trace_mark(kernel_sched_wakeup_new,
-                "pid %d state %ld ## rq %p task %p rq->curr %p",
-                p->pid, p->state, rq, p, rq->curr);
-        check_preempt_curr(rq, p);
+        trace_sched_wakeup_new(rq, p);
+        check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
                 p->sched_class->task_wake_up(rq, p);
@@ -2592,11 +2621,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         struct mm_struct *mm, *oldmm;

         prepare_task_switch(rq, prev, next);
-        trace_mark(kernel_sched_schedule,
-                "prev_pid %d next_pid %d prev_state %ld "
-                "## rq %p prev %p next %p",
-                prev->pid, next->pid, prev->state,
-                rq, prev, next);
+        trace_sched_switch(rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@ -2836,6 +2861,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;

+        trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@ -2880,7 +2906,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
          * Note that idle threads have a prio of MAX_PRIO, for this test
          * to be always true for them.
          */
-        check_preempt_curr(this_rq, p);
+        check_preempt_curr(this_rq, p, 0);
 }

 /*
@@ -3329,7 +3355,7 @@ small_imbalance:
         } else
                 this_load_per_task = cpu_avg_load_per_task(this_cpu);

-        if (max_load - this_load + 2*busiest_load_per_task >=
+        if (max_load - this_load + busiest_load_per_task >=
                         busiest_load_per_task * imbn) {
                 *imbalance = busiest_load_per_task;
                 return busiest;
@@ -4037,23 +4063,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);

 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
         unsigned long flags;
-        u64 ns, delta_exec;
         struct rq *rq;
+        u64 ns = 0;

         rq = task_rq_lock(p, &flags);
-        ns = p->se.sum_exec_runtime;
+
         if (task_current(rq, p)) {
+                u64 delta_exec;
+
                 update_rq_clock(rq);
                 delta_exec = rq->clock - p->se.exec_start;
                 if ((s64)delta_exec > 0)
-                        ns += delta_exec;
+                        ns = delta_exec;
         }
+
         task_rq_unlock(rq, &flags);

         return ns;
@@ -4070,6 +4099,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
         cputime64_t tmp;

         p->utime = cputime_add(p->utime, cputime);
+        account_group_user_time(p, cputime);

         /* Add user time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4124,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
         tmp = cputime_to_cputime64(cputime);

         p->utime = cputime_add(p->utime, cputime);
+        account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);

         cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4160,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         }

         p->stime = cputime_add(p->stime, cputime);
+        account_group_system_time(p, cputime);

         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@ -4305,7 +4337,7 @@ void __kprobes sub_preempt_count(int val)
         /*
          * Underflow?
          */
-        if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+        if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                 return;
         /*
          * Is the spinlock portion underflowing?
@@ -4426,12 +4458,8 @@ need_resched_nonpreemptible:
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);

-        /*
-         * Do the rq-clock update outside the rq lock:
-         */
-        local_irq_disable();
+        spin_lock_irq(&rq->lock);
         update_rq_clock(rq);
-        spin_lock(&rq->lock);
         clear_tsk_need_resched(prev);

         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4627,6 +4655,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */

+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
         unsigned long flags;
@@ -4638,6 +4675,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);

+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
         unsigned long flags;
@@ -4658,10 +4701,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
         wait.flags |= WQ_FLAG_EXCLUSIVE;
         __add_wait_queue_tail(&x->wait, &wait);
         do {
-                if ((state == TASK_INTERRUPTIBLE &&
-                     signal_pending(current)) ||
-                    (state == TASK_KILLABLE &&
-                     fatal_signal_pending(current))) {
+                if (signal_pending_state(state, current)) {
                         timeout = -ERESTARTSYS;
                         break;
                 }
@@ -4689,12 +4729,31 @@ wait_for_common(struct completion *x, long timeout, int state)
         return timeout;
 }

+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
         wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);

+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4702,6 +4761,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);

+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4777,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);

+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
@@ -4719,6 +4793,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);

+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -5121,7 +5202,8 @@ recheck:
                  * Do not allow realtime tasks into groups that have no runtime
                  * assigned.
                  */
-                if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+                if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                                task_group(p)->rt_bandwidth.rt_runtime == 0)
                         return -EPERM;
 #endif

@@ -5787,6 +5869,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;

+        spin_lock_irqsave(&rq->lock, flags);
+
         __sched_fork(idle);
         idle->se.exec_start = sched_clock();

@@ -5794,7 +5878,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->cpus_allowed = cpumask_of_cpu(cpu);
         __set_task_cpu(idle, cpu);

-        spin_lock_irqsave(&rq->lock, flags);
         rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
         idle->oncpu = 1;
@@ -5957,7 +6040,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         set_task_cpu(p, dest_cpu);
         if (on_rq) {
                 activate_task(rq_dest, p, 0);
-                check_preempt_curr(rq_dest, p);
+                check_preempt_curr(rq_dest, p, 0);
         }
 done:
         ret = 1;
@@ -6282,7 +6365,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-        struct ctl_table *table = sd_alloc_ctl_entry(12);
+        struct ctl_table *table = sd_alloc_ctl_entry(13);

         if (table == NULL)
                 return NULL;
@@ -6310,7 +6393,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                 sizeof(int), 0644, proc_dointvec_minmax);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax);
-        /* &table[11] is terminator */
+        set_table_entry(&table[11], "name", sd->name,
+                CORENAME_MAX_SIZE, 0444, proc_dostring);
+        /* &table[12] is terminator */

         return table;
 }
@@ -6802,15 +6887,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
         struct sched_domain *tmp;

         /* Remove the sched domains which do not contribute to scheduling. */
-        for (tmp = sd; tmp; tmp = tmp->parent) {
+        for (tmp = sd; tmp; ) {
                 struct sched_domain *parent = tmp->parent;
                 if (!parent)
                         break;
+
                 if (sd_parent_degenerate(tmp, parent)) {
                         tmp->parent = parent->parent;
                         if (parent->parent)
                                 parent->parent->child = tmp;
-                }
+                } else
+                        tmp = tmp->parent;
         }

         if (sd && sd_degenerate(sd)) {
@@ -7194,13 +7281,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */

+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type) sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type) do { } while (0)
+#endif
+
 #define SD_INIT(sd, type) sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type) \
 static noinline void sd_init_##type(struct sched_domain *sd) \
 { \
         memset(sd, 0, sizeof(*sd)); \
         *sd = SD_##type##_INIT; \
         sd->level = SD_LV_##type; \
+        SD_INIT_NAME(sd, type); \
 }

 SD_INIT_FUNC(CPU)
@@ -7591,6 +7686,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 error:
         free_sched_groups(cpu_map, tmpmask);
         SCHED_CPUMASK_FREE((void *)allmasks);
+        kfree(rd);
         return -ENOMEM;
 #endif
 }
@@ -7692,13 +7788,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * The passed in 'doms_new' should be kmalloc'd. This routine takes
  * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
  *
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
@@ -8242,20 +8339,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */

-        if ((in_atomic() || irqs_disabled()) &&
-            system_state == SYSTEM_RUNNING && !oops_in_progress) {
-                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-                        return;
-                prev_jiffy = jiffies;
-                printk(KERN_ERR "BUG: sleeping function called from invalid"
-                                " context at %s:%d\n", file, line);
-                printk("in_atomic():%d, irqs_disabled():%d\n",
-                        in_atomic(), irqs_disabled());
-                debug_show_held_locks(current);
-                if (irqs_disabled())
-                        print_irqtrace_events(current);
-                dump_stack();
-        }
+        if ((!in_atomic() && !irqs_disabled()) ||
+            system_state != SYSTEM_RUNNING || oops_in_progress)
+                return;
+        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+                return;
+        prev_jiffy = jiffies;
+
+        printk(KERN_ERR
+                "BUG: sleeping function called from invalid context at %s:%d\n",
+                        file, line);
+        printk(KERN_ERR
+                "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+                        in_atomic(), irqs_disabled(),
+                        current->pid, current->comm);
+
+        debug_show_held_locks(current);
+        if (irqs_disabled())
+                print_irqtrace_events(current);
+        dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8855,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
         if (runtime == RUNTIME_INF)
-                return 1ULL << 16;
+                return 1ULL << 20;

-        return div64_u64(runtime << 16, period);
+        return div64_u64(runtime << 20, period);
 }

-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-        struct task_group *tgi, *parent = tg->parent;
-        unsigned long total = 0;
+        struct task_struct *g, *p;

-        if (!parent) {
-                if (global_rt_period() < period)
-                        return 0;
+        do_each_thread(g, p) {
+                if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+                        return 1;
+        } while_each_thread(g, p);

-                return to_ratio(period, runtime) <
-                        to_ratio(global_rt_period(), global_rt_runtime());
-        }
+        return 0;
+}

-        if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-                return 0;
+struct rt_schedulable_data {
+        struct task_group *tg;
+        u64 rt_period;
+        u64 rt_runtime;
+};

-        rcu_read_lock();
-        list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-                if (tgi == tg)
-                        continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+        struct rt_schedulable_data *d = data;
+        struct task_group *child;
+        unsigned long total, sum = 0;
+        u64 period, runtime;
+
+        period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+        runtime = tg->rt_bandwidth.rt_runtime;

-                total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                                tgi->rt_bandwidth.rt_runtime);
+        if (tg == d->tg) {
+                period = d->rt_period;
+                runtime = d->rt_runtime;
         }
-        rcu_read_unlock();

-        return total + to_ratio(period, runtime) <=
-                to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-                                parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-        struct task_group *tgi;
-        unsigned long total = 0;
-        unsigned long global_ratio =
-                to_ratio(global_rt_period(), global_rt_runtime());
+        /*
+         * Cannot have more runtime than the period.
+         */
+        if (runtime > period && runtime != RUNTIME_INF)
+                return -EINVAL;

-        rcu_read_lock();
-        list_for_each_entry_rcu(tgi, &task_groups, list) {
-                if (tgi == tg)
-                        continue;
+        /*
+         * Ensure we don't starve existing RT tasks.
+         */
+        if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+                return -EBUSY;
+
+        total = to_ratio(period, runtime);

-                total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                                tgi->rt_bandwidth.rt_runtime);
+        /*
+         * Nobody can have more than the global setting allows.
+         */
+        if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+                return -EINVAL;
+
+        /*
+         * The sum of our children's runtime should not exceed our own.
+         */
+        list_for_each_entry_rcu(child, &tg->children, siblings) {
+                period = ktime_to_ns(child->rt_bandwidth.rt_period);
+                runtime = child->rt_bandwidth.rt_runtime;
+
+                if (child == d->tg) {
+                        period = d->rt_period;
+                        runtime = d->rt_runtime;
+                }
+
+                sum += to_ratio(period, runtime);
         }
-        rcu_read_unlock();

-        return total + to_ratio(period, runtime) < global_ratio;
+        if (sum > total)
+                return -EINVAL;
+
+        return 0;
 }
-#endif

-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-        struct task_struct *g, *p;
-        do_each_thread(g, p) {
-                if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                        return 1;
-        } while_each_thread(g, p);
-        return 0;
+        struct rt_schedulable_data data = {
+                .tg = tg,
+                .rt_period = period,
+                .rt_runtime = runtime,
+        };
+
+        return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }

 static int tg_set_bandwidth(struct task_group *tg,
@@ -8829,14 +8953,9 @@ static int tg_set_bandwidth(struct task_group *tg,

         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
-        if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-                err = -EBUSY;
-                goto unlock;
-        }
-        if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-                err = -EINVAL;
+        err = __rt_schedulable(tg, rt_period, rt_runtime);
+        if (err)
                 goto unlock;
-        }

         spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +9024,25 @@ long sched_group_rt_period(struct task_group *tg)

 static int sched_rt_global_constraints(void)
 {
-        struct task_group *tg = &root_task_group;
-        u64 rt_runtime, rt_period;
+        u64 runtime, period;
         int ret = 0;

         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;

-        rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-        rt_runtime = tg->rt_bandwidth.rt_runtime;
+        runtime = global_rt_runtime();
+        period = global_rt_period();
+
+        /*
+         * Sanity check on the sysctl variables.
+         */
+        if (runtime > period && runtime != RUNTIME_INF)
+                return -EINVAL;

         mutex_lock(&rt_constraints_mutex);
-        if (!__rt_schedulable(tg, rt_period, rt_runtime))
-                ret = -EINVAL;
+        read_lock(&tasklist_lock);
+        ret = __rt_schedulable(NULL, 0, 0);
+        read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);

         return ret;
@@ -8991,7 +9116,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)

         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-                init_task_group.css.cgroup = cgrp;
                 return &init_task_group.css;
         }

@@ -9000,9 +9124,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);

-        /* Bind the cgroup to task_group object we just created */
-        tg->css.cgroup = cgrp;
-
         return &tg->css;
 }

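A note for readers unfamiliar with the callback style this patch introduces: walk_tg_tree() now takes two int-returning tg_visitor callbacks plus an opaque void *data, calls the "down" visitor when entering a group and the "up" visitor when leaving it, and aborts the walk as soon as a visitor returns non-zero (which is how tg_schedulable() propagates -EBUSY/-EINVAL out of __rt_schedulable()). The following standalone C sketch only illustrates that pattern; the struct node type, field names, and the recursive traversal are assumptions made for the sake of a compilable example, not kernel code -- the kernel's walk_tg_tree() is iterative and runs under rcu_read_lock().

/* Minimal sketch of the down/up visitor walk used by walk_tg_tree(). */
#include <stdio.h>

struct node {
        const char *name;
        struct node *child;     /* first child, or NULL */
        struct node *sibling;   /* next sibling, or NULL */
};

typedef int (*visitor)(struct node *n, void *data);

static int walk_tree(struct node *n, visitor down, visitor up, void *data)
{
        struct node *c;
        int ret = down(n, data);

        if (ret)
                return ret;     /* non-zero aborts the walk, like "goto out_unlock" */

        for (c = n->child; c; c = c->sibling) {
                ret = walk_tree(c, down, up, data);
                if (ret)
                        return ret;
        }

        return up(n, data);
}

static int nop(struct node *n, void *data)
{
        (void)n; (void)data;
        return 0;               /* analogous to tg_nop() */
}

static int print_enter(struct node *n, void *data)
{
        (void)data;
        printf("enter %s\n", n->name);
        return 0;
}

int main(void)
{
        struct node leaf = { "leaf", NULL, NULL };
        struct node root = { "root", &leaf, NULL };

        /* Visits root, then leaf; returns 0 because no visitor aborts. */
        return walk_tree(&root, print_enter, nop, NULL);
}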