author		Ingo Molnar <mingo@elte.hu>	2008-10-28 11:54:49 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:54:49 -0400
commit		d1a76187a5be4f89c6cb19d800cb5fb7aac735c5 (patch)
tree		2fac3ffbfffc7560eeef8364b541d0d7a0057920 /kernel/sched.c
parent		c7e78cff6b7518212247fb20b1dc6411540dc9af (diff)
parent		0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into core/locking
Conflicts:
arch/um/include/asm/system.h
Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c (-rw-r--r--) | 476
 1 files changed, 291 insertions, 185 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index ec3bd1f398b3..0a4dc3b1300b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
@@ -71,6 +72,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -201,14 +203,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
 }
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -221,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
 		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-		hrtimer_start(&rt_b->rt_period_timer,
-				rt_b->rt_period_timer.expires,
-				HRTIMER_MODE_ABS);
+		hrtimer_start_expires(&rt_b->rt_period_timer,
+				HRTIMER_MODE_ABS);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 }
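The hunk above replaces the open-coded hrtimer_start(timer, timer->expires, ...) sequence with hrtimer_start_expires(), which re-arms the timer at the expiry value already stored in it (a companion hunk further down switches hrtick_start() over to hrtimer_set_expires()). A minimal sketch of the new re-arm pattern; the wrapper function below is hypothetical, while the hrtimer helpers are the ones visible in this diff:

    #include <linux/hrtimer.h>

    /* Push a periodic timer forward by one period and re-arm it. */
    static void rearm_period_timer(struct hrtimer *timer, ktime_t period)
    {
        ktime_t now = hrtimer_cb_get_time(timer);

        /* hrtimer_forward() advances the expiry stored inside the timer... */
        hrtimer_forward(timer, now, period);
        /* ...and hrtimer_start_expires() starts it at exactly that expiry,
         * so the caller never touches timer->expires directly any more. */
        hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
    }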
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -604,9 +610,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -813,6 +819,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
@@ -1058,7 +1071,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	struct hrtimer *timer = &rq->hrtick_timer;
 	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-	timer->expires = time;
+	hrtimer_set_expires(timer, time);
 
 	if (rq == this_rq()) {
 		hrtimer_restart(timer);
@@ -1087,7 +1100,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_DONE;
 }
 
-static void init_hrtick(void)
+static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
@@ -1102,7 +1115,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1119,9 +1132,9 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1146,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif	/* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1393,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1419,23 +1418,51 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
 }
 
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
+update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1466,19 +1493,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *
 	 */
 	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+	if (abs(shares - tg->se[cpu]->load.weight) >
+			sysctl_sched_shares_thresh) {
+		struct rq *rq = cpu_rq(cpu);
+		unsigned long flags;
 
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
+		spin_lock_irqsave(&rq->lock, flags);
+		/*
+		 * record the actual number of shares, not the boosted amount.
+		 */
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+		tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
-	__set_se_shares(tg->se[cpu], shares);
+		__set_se_shares(tg->se[cpu], shares);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
 }
 
 /*
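The thresholded update above is what the new sysctl_sched_shares_thresh knob (default 4, added in an earlier hunk) controls: the remote runqueue lock is only taken when the freshly computed share differs from the installed weight by more than the threshold. A simplified, self-contained model of that hysteresis; all names below are hypothetical, only the shape of the check mirrors update_group_shares_cpu():

    #include <linux/kernel.h>
    #include <linux/spinlock.h>

    struct group_cpu_state {
        spinlock_t lock;          /* stands in for the remote rq->lock */
        unsigned long weight;     /* currently installed share */
    };

    static void maybe_update_share(struct group_cpu_state *gc,
                                   unsigned long shares, unsigned long thresh)
    {
        long delta = (long)shares - (long)gc->weight;

        if (abs(delta) <= thresh)
            return;               /* inside the fuzz window: skip the lock */

        spin_lock(&gc->lock);
        gc->weight = shares;
        spin_unlock(&gc->lock);
    }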
@@ -1486,11 +1517,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1507,14 +1538,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 	if (!rq_weight)
 		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
+	for_each_cpu_mask(i, sd->span)
+		update_group_shares_cpu(tg, i, shares, rq_weight);
 
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, i, shares, rq_weight);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
+	return 0;
 }
 
 /*
@@ -1522,10 +1549,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1563,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1574,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1561,9 +1585,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
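walk_tg_tree() now takes two int-returning visitors plus an opaque void *data cookie, and a non-zero return from either visitor aborts the walk, as the hunks above show for tg_shares_up(), tg_load_down() and their callers. A sketch of a caller written against the new signature; the counting visitor is made up for illustration, while walk_tg_tree() and tg_nop() are the functions from the diff:

    /* Count every task group in the hierarchy (illustrative only). */
    static int tg_count_one(struct task_group *tg, void *data)
    {
        unsigned int *count = data;

        (*count)++;
        return 0;               /* zero means "keep walking" */
    }

    static unsigned int count_task_groups(void)
    {
        unsigned int count = 0;

        /* Count on the way down, do nothing on the way back up. */
        walk_tg_tree(tg_count_one, tg_nop, &count);
        return count;
    }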
@@ -1918,14 +1942,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 	 * just go back and repeat.
 	 */
 	rq = task_rq_lock(p, &flags);
+	trace_sched_wait_task(rq, p);
 	running = task_running(rq, p);
 	on_rq = p->se.on_rq;
 	ncsw = 0;
-	if (!match_state || p->state == match_state) {
-		ncsw = p->nivcsw + p->nvcsw;
-		if (unlikely(!ncsw))
-			ncsw = 1;
-	}
+	if (!match_state || p->state == match_state)
+		ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 	task_rq_unlock(rq, &flags);
 
 	/*
@@ -2282,10 +2304,8 @@ out_activate:
 	success = 1;
 
 out_running:
-	trace_mark(kernel_sched_wakeup,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	trace_sched_wakeup(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,10 +2437,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	trace_mark(kernel_sched_wakeup_new,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	trace_sched_wakeup_new(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2592,11 +2610,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_mark(kernel_sched_schedule,
-		"prev_pid %d next_pid %d prev_state %ld "
-		"## rq %p prev %p next %p",
-		prev->pid, next->pid, prev->state,
-		rq, prev, next);
+	trace_sched_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
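The trace_mark() calls removed above are replaced by the static scheduler tracepoints declared in the newly included <trace/sched.h> (trace_sched_wakeup, trace_sched_wakeup_new, trace_sched_switch, and so on). A sketch of how a tracer might attach to one of them; the probe and the module boilerplate are hypothetical, register/unregister_trace_sched_switch() are the helpers generated by the tracepoint declaration:

    #include <linux/module.h>
    #include <trace/sched.h>

    /* Runs on every context switch once registered. */
    static void probe_sched_switch(struct rq *rq, struct task_struct *prev,
                                   struct task_struct *next)
    {
        /* record prev->pid -> next->pid somewhere cheap */
    }

    static int __init switch_probe_init(void)
    {
        return register_trace_sched_switch(probe_sched_switch);
    }

    static void __exit switch_probe_exit(void)
    {
        unregister_trace_sched_switch(probe_sched_switch);
    }

    module_init(switch_probe_init);
    module_exit(switch_probe_exit);
    MODULE_LICENSE("GPL");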
@@ -2836,6 +2850,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
+	trace_sched_migrate_task(rq, p, dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -4037,23 +4052,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
-	u64 ns, delta_exec;
 	struct rq *rq;
+	u64 ns = 0;
 
 	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime;
+
 	if (task_current(rq, p)) {
+		u64 delta_exec;
+
 		update_rq_clock(rq);
 		delta_exec = rq->clock - p->se.exec_start;
 		if ((s64)delta_exec > 0)
-			ns += delta_exec;
+			ns = delta_exec;
 	}
+
 	task_rq_unlock(rq, &flags);
 
 	return ns;
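Unlike the task_sched_runtime() it replaces here, task_delta_exec() returns only the slice of runtime that has not yet been banked for a task currently on a CPU. A caller wanting an up-to-date total would add that delta to the runtime already accumulated in the scheduling entity; roughly like this (illustrative, not code from this commit):

    /* Approximate total CPU time consumed by @p so far, in nanoseconds. */
    static u64 task_total_runtime_ns(struct task_struct *p)
    {
        return p->se.sum_exec_runtime + task_delta_exec(p);
    }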
@@ -4070,6 +4088,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 	cputime64_t tmp;
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4113,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
 	tmp = cputime_to_cputime64(cputime);
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4149,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	}
 
 	p->stime = cputime_add(p->stime, cputime);
+	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4191,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
 	if (p == rq->idle) {
 		p->stime = cputime_add(p->stime, steal);
+		account_group_system_time(p, steal);
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
@@ -4426,12 +4448,8 @@ need_resched_nonpreemptible:
 	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
 
-	/*
-	 * Do the rq-clock update outside the rq lock:
-	 */
-	local_irq_disable();
+	spin_lock_irq(&rq->lock);
 	update_rq_clock(rq);
-	spin_lock(&rq->lock);
 	clear_tsk_need_resched(prev);
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4627,6 +4645,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4638,6 +4665,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4658,10 +4691,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
@@ -4689,12 +4719,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4702,6 +4751,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4767,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4719,6 +4783,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
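The kerneldoc blocks added above document the completion API as a whole. The canonical usage is one side calling complete() (or complete_all()) while the other blocks in one of the wait_for_completion*() variants; a sketch with a hypothetical worker thread:

    #include <linux/completion.h>
    #include <linux/kthread.h>
    #include <linux/err.h>

    static DECLARE_COMPLETION(setup_done);

    static int worker_fn(void *unused)
    {
        /* ... perform the setup work ... */
        complete(&setup_done);      /* wakes one waiter, in queue order */
        return 0;
    }

    static int start_and_wait(void)
    {
        struct task_struct *tsk = kthread_run(worker_fn, NULL, "setup-worker");

        if (IS_ERR(tsk))
            return PTR_ERR(tsk);

        /* Uninterruptible and unbounded; see wait_for_completion_timeout()
         * or wait_for_completion_killable() for the bounded/killable forms. */
        wait_for_completion(&setup_done);
        return 0;
    }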
@@ -5121,7 +5192,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (rt_bandwidth_enabled() && rt_policy(policy) &&
+			task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -5957,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
@@ -6282,7 +6354,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(12);
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
 
 	if (table == NULL)
 		return NULL;
@@ -6310,7 +6382,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	/* &table[11] is terminator */
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
 
 	return table;
 }
@@ -7194,13 +7268,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)	sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)	do { } while (0)
+#endif
+
 #define SD_INIT(sd, type)	sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type) \
 static noinline void sd_init_##type(struct sched_domain *sd) \
 { \
 	memset(sd, 0, sizeof(*sd)); \
 	*sd = SD_##type##_INIT; \
 	sd->level = SD_LV_##type; \
+	SD_INIT_NAME(sd, type); \
 }
 
 SD_INIT_FUNC(CPU)
@@ -8242,20 +8324,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8840,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
+
+	total = to_ratio(period, runtime);
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
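The rewritten admission test is pure fixed-point arithmetic: to_ratio() now scales runtime/period by 2^20 (previously 2^16), tg_schedulable() rejects any group whose ratio exceeds the global limit, and requires the children's ratios to sum to no more than their parent's. A standalone arithmetic sketch of those checks, using a global budget of 0.95 s runtime per 1 s period; the helper and the example values are illustrative, only the shift and the inequalities mirror the hunk above:

    #include <stdint.h>
    #include <stdio.h>

    /* Same fixed-point ratio as the kernel helper: runtime/period in 2^-20 units. */
    static uint64_t to_ratio(uint64_t period_us, uint64_t runtime_us)
    {
        return (runtime_us << 20) / period_us;
    }

    int main(void)
    {
        uint64_t global  = to_ratio(1000000, 950000);   /* ~996147 of 1048576 */
        uint64_t child_a = to_ratio(1000000, 400000);
        uint64_t child_b = to_ratio(1000000, 300000);

        printf("global ceiling : %llu\n", (unsigned long long)global);
        printf("children's sum : %llu\n",
               (unsigned long long)(child_a + child_b));
        /* 0.4 + 0.3 fits under the 0.95 parent/global cap, so this prints 1. */
        printf("schedulable    : %d\n", child_a + child_b <= global);
        return 0;
    }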
@@ -8829,14 +8938,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
-		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +9009,25 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(NULL, 0, 0);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;
@@ -8991,7 +9101,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
@@ -9000,9 +9109,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cgrp;
-
 	return &tg->css;
 }
 