Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 501 |
1 file changed, 311 insertions, 190 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962dc0aa2..c94baf2969e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -55,6 +55,7 @@ | |||
| 55 | #include <linux/cpuset.h> | 55 | #include <linux/cpuset.h> |
| 56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
| 57 | #include <linux/kthread.h> | 57 | #include <linux/kthread.h> |
| 58 | #include <linux/proc_fs.h> | ||
| 58 | #include <linux/seq_file.h> | 59 | #include <linux/seq_file.h> |
| 59 | #include <linux/sysctl.h> | 60 | #include <linux/sysctl.h> |
| 60 | #include <linux/syscalls.h> | 61 | #include <linux/syscalls.h> |
| @@ -71,6 +72,7 @@ | |||
| 71 | #include <linux/debugfs.h> | 72 | #include <linux/debugfs.h> |
| 72 | #include <linux/ctype.h> | 73 | #include <linux/ctype.h> |
| 73 | #include <linux/ftrace.h> | 74 | #include <linux/ftrace.h> |
| 75 | #include <trace/sched.h> | ||
| 74 | 76 | ||
| 75 | #include <asm/tlb.h> | 77 | #include <asm/tlb.h> |
| 76 | #include <asm/irq_regs.h> | 78 | #include <asm/irq_regs.h> |
| @@ -204,11 +206,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
| 204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; | 206 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; |
| 205 | } | 207 | } |
| 206 | 208 | ||
| 209 | static inline int rt_bandwidth_enabled(void) | ||
| 210 | { | ||
| 211 | return sysctl_sched_rt_runtime >= 0; | ||
| 212 | } | ||
| 213 | |||
| 207 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 214 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) |
| 208 | { | 215 | { |
| 209 | ktime_t now; | 216 | ktime_t now; |
| 210 | 217 | ||
| 211 | if (rt_b->rt_runtime == RUNTIME_INF) | 218 | if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) |
| 212 | return; | 219 | return; |
| 213 | 220 | ||
| 214 | if (hrtimer_active(&rt_b->rt_period_timer)) | 221 | if (hrtimer_active(&rt_b->rt_period_timer)) |
| @@ -221,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 221 | 228 | ||
| 222 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | 229 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); |
| 223 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | 230 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); |
| 224 | hrtimer_start(&rt_b->rt_period_timer, | 231 | hrtimer_start_expires(&rt_b->rt_period_timer, |
| 225 | rt_b->rt_period_timer.expires, | 232 | HRTIMER_MODE_ABS); |
| 226 | HRTIMER_MODE_ABS); | ||
| 227 | } | 233 | } |
| 228 | spin_unlock(&rt_b->rt_runtime_lock); | 234 | spin_unlock(&rt_b->rt_runtime_lock); |
| 229 | } | 235 | } |
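The new rt_bandwidth_enabled() helper keys off the existing sched_rt_runtime_us sysctl, where a negative value conventionally means RT throttling is off. A minimal user-space sketch of that check (the sysctl value here is just a placeholder, names mirror the diff):

```c
#include <stdio.h>

/* Illustrative stand-in for the kernel sysctl; -1 means "no limit". */
static int sysctl_sched_rt_runtime = 950000;	/* microseconds */

static int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

int main(void)
{
	printf("runtime=%d -> enabled=%d\n",
	       sysctl_sched_rt_runtime, rt_bandwidth_enabled());

	sysctl_sched_rt_runtime = -1;		/* throttling disabled */
	printf("runtime=%d -> enabled=%d\n",
	       sysctl_sched_rt_runtime, rt_bandwidth_enabled());
	return 0;
}
```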
| @@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | |||
| 298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 304 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 305 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
| 300 | #endif /* CONFIG_RT_GROUP_SCHED */ | 306 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 307 | #else /* !CONFIG_USER_SCHED */ |
| 302 | #define root_task_group init_task_group | 308 | #define root_task_group init_task_group |
| 303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 309 | #endif /* CONFIG_USER_SCHED */ |
| 304 | 310 | ||
| 305 | /* task_group_lock serializes add/remove of task groups and also changes to | 311 | /* task_group_lock serializes add/remove of task groups and also changes to |
| 306 | * a task group's cpu shares. | 312 | * a task group's cpu shares. |
| @@ -380,7 +386,6 @@ struct cfs_rq { | |||
| 380 | 386 | ||
| 381 | u64 exec_clock; | 387 | u64 exec_clock; |
| 382 | u64 min_vruntime; | 388 | u64 min_vruntime; |
| 383 | u64 pair_start; | ||
| 384 | 389 | ||
| 385 | struct rb_root tasks_timeline; | 390 | struct rb_root tasks_timeline; |
| 386 | struct rb_node *rb_leftmost; | 391 | struct rb_node *rb_leftmost; |
| @@ -392,9 +397,9 @@ struct cfs_rq { | |||
| 392 | * 'curr' points to currently running entity on this cfs_rq. | 397 | * 'curr' points to currently running entity on this cfs_rq. |
| 393 | * It is set to NULL otherwise (i.e when none are currently running). | 398 | * It is set to NULL otherwise (i.e when none are currently running). |
| 394 | */ | 399 | */ |
| 395 | struct sched_entity *curr, *next; | 400 | struct sched_entity *curr, *next, *last; |
| 396 | 401 | ||
| 397 | unsigned long nr_spread_over; | 402 | unsigned int nr_spread_over; |
| 398 | 403 | ||
| 399 | #ifdef CONFIG_FAIR_GROUP_SCHED | 404 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 400 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 405 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
| @@ -604,9 +609,9 @@ struct rq { | |||
| 604 | 609 | ||
| 605 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 610 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 606 | 611 | ||
| 607 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 612 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) |
| 608 | { | 613 | { |
| 609 | rq->curr->sched_class->check_preempt_curr(rq, p); | 614 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); |
| 610 | } | 615 | } |
| 611 | 616 | ||
| 612 | static inline int cpu_of(struct rq *rq) | 617 | static inline int cpu_of(struct rq *rq) |
| @@ -813,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
| 813 | unsigned int sysctl_sched_shares_ratelimit = 250000; | 818 | unsigned int sysctl_sched_shares_ratelimit = 250000; |
| 814 | 819 | ||
| 815 | /* | 820 | /* |
| 821 | * Inject some fuzzyness into changing the per-cpu group shares | ||
| 822 | * this avoids remote rq-locks at the expense of fairness. | ||
| 823 | * default: 4 | ||
| 824 | */ | ||
| 825 | unsigned int sysctl_sched_shares_thresh = 4; | ||
| 826 | |||
| 827 | /* | ||
| 816 | * period over which we measure -rt task cpu usage in us. | 828 | * period over which we measure -rt task cpu usage in us. |
| 817 | * default: 1s | 829 | * default: 1s |
| 818 | */ | 830 | */ |
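The comment about sysctl_sched_shares_thresh above describes a hysteresis knob: per-cpu group shares are only rewritten when they have drifted by more than the threshold, trading a little fairness for fewer remote runqueue locks. A rough user-space sketch of that gating idea (names are illustrative; the real update happens under rq->lock in update_group_shares_cpu() further down):

```c
#include <stdio.h>
#include <stdlib.h>

static unsigned int shares_thresh = 4;	/* mirrors the default above */

/* Return 1 if the newly computed share value differs enough from the
 * currently installed weight to be worth taking the remote rq lock. */
static int worth_updating(long new_shares, long cur_weight)
{
	return labs(new_shares - cur_weight) > (long)shares_thresh;
}

int main(void)
{
	printf("%d\n", worth_updating(1024, 1022));	/* 0: within threshold, skip */
	printf("%d\n", worth_updating(1024, 1000));	/* 1: drift too large, update */
	return 0;
}
```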
| @@ -957,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 957 | } | 969 | } |
| 958 | } | 970 | } |
| 959 | 971 | ||
| 972 | void task_rq_unlock_wait(struct task_struct *p) | ||
| 973 | { | ||
| 974 | struct rq *rq = task_rq(p); | ||
| 975 | |||
| 976 | smp_mb(); /* spin-unlock-wait is not a full memory barrier */ | ||
| 977 | spin_unlock_wait(&rq->lock); | ||
| 978 | } | ||
| 979 | |||
| 960 | static void __task_rq_unlock(struct rq *rq) | 980 | static void __task_rq_unlock(struct rq *rq) |
| 961 | __releases(rq->lock) | 981 | __releases(rq->lock) |
| 962 | { | 982 | { |
| @@ -1058,7 +1078,7 @@ static void hrtick_start(struct rq *rq, u64 delay) | |||
| 1058 | struct hrtimer *timer = &rq->hrtick_timer; | 1078 | struct hrtimer *timer = &rq->hrtick_timer; |
| 1059 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 1079 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
| 1060 | 1080 | ||
| 1061 | timer->expires = time; | 1081 | hrtimer_set_expires(timer, time); |
| 1062 | 1082 | ||
| 1063 | if (rq == this_rq()) { | 1083 | if (rq == this_rq()) { |
| 1064 | hrtimer_restart(timer); | 1084 | hrtimer_restart(timer); |
| @@ -1102,7 +1122,7 @@ static void hrtick_start(struct rq *rq, u64 delay) | |||
| 1102 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); | 1122 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); |
| 1103 | } | 1123 | } |
| 1104 | 1124 | ||
| 1105 | static void init_hrtick(void) | 1125 | static inline void init_hrtick(void) |
| 1106 | { | 1126 | { |
| 1107 | } | 1127 | } |
| 1108 | #endif /* CONFIG_SMP */ | 1128 | #endif /* CONFIG_SMP */ |
| @@ -1121,7 +1141,7 @@ static void init_rq_hrtick(struct rq *rq) | |||
| 1121 | rq->hrtick_timer.function = hrtick; | 1141 | rq->hrtick_timer.function = hrtick; |
| 1122 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; | 1142 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; |
| 1123 | } | 1143 | } |
| 1124 | #else | 1144 | #else /* CONFIG_SCHED_HRTICK */ |
| 1125 | static inline void hrtick_clear(struct rq *rq) | 1145 | static inline void hrtick_clear(struct rq *rq) |
| 1126 | { | 1146 | { |
| 1127 | } | 1147 | } |
| @@ -1133,7 +1153,7 @@ static inline void init_rq_hrtick(struct rq *rq) | |||
| 1133 | static inline void init_hrtick(void) | 1153 | static inline void init_hrtick(void) |
| 1134 | { | 1154 | { |
| 1135 | } | 1155 | } |
| 1136 | #endif | 1156 | #endif /* CONFIG_SCHED_HRTICK */ |
| 1137 | 1157 | ||
| 1138 | /* | 1158 | /* |
| 1139 | * resched_task - mark a task 'to be rescheduled now'. | 1159 | * resched_task - mark a task 'to be rescheduled now'. |
| @@ -1380,38 +1400,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
| 1380 | update_load_sub(&rq->load, load); | 1400 | update_load_sub(&rq->load, load); |
| 1381 | } | 1401 | } |
| 1382 | 1402 | ||
| 1383 | #ifdef CONFIG_SMP | 1403 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) |
| 1384 | static unsigned long source_load(int cpu, int type); | 1404 | typedef int (*tg_visitor)(struct task_group *, void *); |
| 1385 | static unsigned long target_load(int cpu, int type); | ||
| 1386 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
| 1387 | |||
| 1388 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1389 | { | ||
| 1390 | struct rq *rq = cpu_rq(cpu); | ||
| 1391 | |||
| 1392 | if (rq->nr_running) | ||
| 1393 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
| 1394 | |||
| 1395 | return rq->avg_load_per_task; | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1399 | |||
| 1400 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
| 1401 | 1405 | ||
| 1402 | /* | 1406 | /* |
| 1403 | * Iterate the full tree, calling @down when first entering a node and @up when | 1407 | * Iterate the full tree, calling @down when first entering a node and @up when |
| 1404 | * leaving it for the final time. | 1408 | * leaving it for the final time. |
| 1405 | */ | 1409 | */ |
| 1406 | static void | 1410 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) |
| 1407 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
| 1408 | { | 1411 | { |
| 1409 | struct task_group *parent, *child; | 1412 | struct task_group *parent, *child; |
| 1413 | int ret; | ||
| 1410 | 1414 | ||
| 1411 | rcu_read_lock(); | 1415 | rcu_read_lock(); |
| 1412 | parent = &root_task_group; | 1416 | parent = &root_task_group; |
| 1413 | down: | 1417 | down: |
| 1414 | (*down)(parent, cpu, sd); | 1418 | ret = (*down)(parent, data); |
| 1419 | if (ret) | ||
| 1420 | goto out_unlock; | ||
| 1415 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1421 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
| 1416 | parent = child; | 1422 | parent = child; |
| 1417 | goto down; | 1423 | goto down; |
| @@ -1419,23 +1425,53 @@ down: | |||
| 1419 | up: | 1425 | up: |
| 1420 | continue; | 1426 | continue; |
| 1421 | } | 1427 | } |
| 1422 | (*up)(parent, cpu, sd); | 1428 | ret = (*up)(parent, data); |
| 1429 | if (ret) | ||
| 1430 | goto out_unlock; | ||
| 1423 | 1431 | ||
| 1424 | child = parent; | 1432 | child = parent; |
| 1425 | parent = parent->parent; | 1433 | parent = parent->parent; |
| 1426 | if (parent) | 1434 | if (parent) |
| 1427 | goto up; | 1435 | goto up; |
| 1436 | out_unlock: | ||
| 1428 | rcu_read_unlock(); | 1437 | rcu_read_unlock(); |
| 1438 | |||
| 1439 | return ret; | ||
| 1429 | } | 1440 | } |
| 1430 | 1441 | ||
| 1442 | static int tg_nop(struct task_group *tg, void *data) | ||
| 1443 | { | ||
| 1444 | return 0; | ||
| 1445 | } | ||
| 1446 | #endif | ||
| 1447 | |||
| 1448 | #ifdef CONFIG_SMP | ||
| 1449 | static unsigned long source_load(int cpu, int type); | ||
| 1450 | static unsigned long target_load(int cpu, int type); | ||
| 1451 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
| 1452 | |||
| 1453 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1454 | { | ||
| 1455 | struct rq *rq = cpu_rq(cpu); | ||
| 1456 | |||
| 1457 | if (rq->nr_running) | ||
| 1458 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
| 1459 | else | ||
| 1460 | rq->avg_load_per_task = 0; | ||
| 1461 | |||
| 1462 | return rq->avg_load_per_task; | ||
| 1463 | } | ||
| 1464 | |||
| 1465 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1466 | |||
| 1431 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1467 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1432 | 1468 | ||
| 1433 | /* | 1469 | /* |
| 1434 | * Calculate and set the cpu's group shares. | 1470 | * Calculate and set the cpu's group shares. |
| 1435 | */ | 1471 | */ |
| 1436 | static void | 1472 | static void |
| 1437 | __update_group_shares_cpu(struct task_group *tg, int cpu, | 1473 | update_group_shares_cpu(struct task_group *tg, int cpu, |
| 1438 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1474 | unsigned long sd_shares, unsigned long sd_rq_weight) |
| 1439 | { | 1475 | { |
| 1440 | int boost = 0; | 1476 | int boost = 0; |
| 1441 | unsigned long shares; | 1477 | unsigned long shares; |
| @@ -1466,19 +1502,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1466 | * | 1502 | * |
| 1467 | */ | 1503 | */ |
| 1468 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); | 1504 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); |
| 1505 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
| 1469 | 1506 | ||
| 1470 | /* | 1507 | if (abs(shares - tg->se[cpu]->load.weight) > |
| 1471 | * record the actual number of shares, not the boosted amount. | 1508 | sysctl_sched_shares_thresh) { |
| 1472 | */ | 1509 | struct rq *rq = cpu_rq(cpu); |
| 1473 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | 1510 | unsigned long flags; |
| 1474 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
| 1475 | 1511 | ||
| 1476 | if (shares < MIN_SHARES) | 1512 | spin_lock_irqsave(&rq->lock, flags); |
| 1477 | shares = MIN_SHARES; | 1513 | /* |
| 1478 | else if (shares > MAX_SHARES) | 1514 | * record the actual number of shares, not the boosted amount. |
| 1479 | shares = MAX_SHARES; | 1515 | */ |
| 1516 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
| 1517 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
| 1480 | 1518 | ||
| 1481 | __set_se_shares(tg->se[cpu], shares); | 1519 | __set_se_shares(tg->se[cpu], shares); |
| 1520 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1521 | } | ||
| 1482 | } | 1522 | } |
| 1483 | 1523 | ||
| 1484 | /* | 1524 | /* |
| @@ -1486,11 +1526,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1486 | * This needs to be done in a bottom-up fashion because the rq weight of a | 1526 | * This needs to be done in a bottom-up fashion because the rq weight of a |
| 1487 | * parent group depends on the shares of its child groups. | 1527 | * parent group depends on the shares of its child groups. |
| 1488 | */ | 1528 | */ |
| 1489 | static void | 1529 | static int tg_shares_up(struct task_group *tg, void *data) |
| 1490 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1491 | { | 1530 | { |
| 1492 | unsigned long rq_weight = 0; | 1531 | unsigned long rq_weight = 0; |
| 1493 | unsigned long shares = 0; | 1532 | unsigned long shares = 0; |
| 1533 | struct sched_domain *sd = data; | ||
| 1494 | int i; | 1534 | int i; |
| 1495 | 1535 | ||
| 1496 | for_each_cpu_mask(i, sd->span) { | 1536 | for_each_cpu_mask(i, sd->span) { |
| @@ -1507,14 +1547,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
| 1507 | if (!rq_weight) | 1547 | if (!rq_weight) |
| 1508 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; | 1548 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; |
| 1509 | 1549 | ||
| 1510 | for_each_cpu_mask(i, sd->span) { | 1550 | for_each_cpu_mask(i, sd->span) |
| 1511 | struct rq *rq = cpu_rq(i); | 1551 | update_group_shares_cpu(tg, i, shares, rq_weight); |
| 1512 | unsigned long flags; | ||
| 1513 | 1552 | ||
| 1514 | spin_lock_irqsave(&rq->lock, flags); | 1553 | return 0; |
| 1515 | __update_group_shares_cpu(tg, i, shares, rq_weight); | ||
| 1516 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1517 | } | ||
| 1518 | } | 1554 | } |
| 1519 | 1555 | ||
| 1520 | /* | 1556 | /* |
| @@ -1522,10 +1558,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
| 1522 | * This needs to be done in a top-down fashion because the load of a child | 1558 | * This needs to be done in a top-down fashion because the load of a child |
| 1523 | * group is a fraction of its parents load. | 1559 | * group is a fraction of its parents load. |
| 1524 | */ | 1560 | */ |
| 1525 | static void | 1561 | static int tg_load_down(struct task_group *tg, void *data) |
| 1526 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1527 | { | 1562 | { |
| 1528 | unsigned long load; | 1563 | unsigned long load; |
| 1564 | long cpu = (long)data; | ||
| 1529 | 1565 | ||
| 1530 | if (!tg->parent) { | 1566 | if (!tg->parent) { |
| 1531 | load = cpu_rq(cpu)->load.weight; | 1567 | load = cpu_rq(cpu)->load.weight; |
| @@ -1536,11 +1572,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
| 1536 | } | 1572 | } |
| 1537 | 1573 | ||
| 1538 | tg->cfs_rq[cpu]->h_load = load; | 1574 | tg->cfs_rq[cpu]->h_load = load; |
| 1539 | } | ||
| 1540 | 1575 | ||
| 1541 | static void | 1576 | return 0; |
| 1542 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1543 | { | ||
| 1544 | } | 1577 | } |
| 1545 | 1578 | ||
| 1546 | static void update_shares(struct sched_domain *sd) | 1579 | static void update_shares(struct sched_domain *sd) |
| @@ -1550,7 +1583,7 @@ static void update_shares(struct sched_domain *sd) | |||
| 1550 | 1583 | ||
| 1551 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1584 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| 1552 | sd->last_update = now; | 1585 | sd->last_update = now; |
| 1553 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | 1586 | walk_tg_tree(tg_nop, tg_shares_up, sd); |
| 1554 | } | 1587 | } |
| 1555 | } | 1588 | } |
| 1556 | 1589 | ||
| @@ -1561,9 +1594,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
| 1561 | spin_lock(&rq->lock); | 1594 | spin_lock(&rq->lock); |
| 1562 | } | 1595 | } |
| 1563 | 1596 | ||
| 1564 | static void update_h_load(int cpu) | 1597 | static void update_h_load(long cpu) |
| 1565 | { | 1598 | { |
| 1566 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | 1599 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
| 1567 | } | 1600 | } |
| 1568 | 1601 | ||
| 1569 | #else | 1602 | #else |
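walk_tg_tree() now takes two int-returning visitors plus an opaque void *data cookie, and aborts the walk as soon as a visitor returns non-zero; tg_load_down(), tg_shares_up() and the new tg_schedulable() below are all written against this shape. A self-contained model of the down/up visitor pattern, using plain recursion instead of the kernel's RCU-protected sibling lists:

```c
#include <stdio.h>

struct node {
	const char *name;
	struct node *child[4];		/* NULL-terminated list of children */
};

typedef int (*visitor)(struct node *n, void *data);

/* Call @down on first entry to a node and @up when leaving it for the
 * last time; stop early if either visitor returns non-zero. */
static int walk(struct node *n, visitor down, visitor up, void *data)
{
	int i, ret;

	ret = down(n, data);
	if (ret)
		return ret;
	for (i = 0; n->child[i]; i++) {
		ret = walk(n->child[i], down, up, data);
		if (ret)
			return ret;
	}
	return up(n, data);
}

static int print_down(struct node *n, void *data)
{
	printf("down %s\n", n->name);
	return 0;
}

static int nop(struct node *n, void *data)
{
	return 0;			/* analogous to tg_nop() */
}

int main(void)
{
	struct node leaf1 = { "leaf1", { NULL } };
	struct node leaf2 = { "leaf2", { NULL } };
	struct node root  = { "root", { &leaf1, &leaf2, NULL } };

	return walk(&root, print_down, nop, NULL);
}
```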
| @@ -1782,7 +1815,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1782 | /* | 1815 | /* |
| 1783 | * Buddy candidates are cache hot: | 1816 | * Buddy candidates are cache hot: |
| 1784 | */ | 1817 | */ |
| 1785 | if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) | 1818 | if (sched_feat(CACHE_HOT_BUDDY) && |
| 1819 | (&p->se == cfs_rq_of(&p->se)->next || | ||
| 1820 | &p->se == cfs_rq_of(&p->se)->last)) | ||
| 1786 | return 1; | 1821 | return 1; |
| 1787 | 1822 | ||
| 1788 | if (p->sched_class != &fair_sched_class) | 1823 | if (p->sched_class != &fair_sched_class) |
| @@ -1918,14 +1953,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1918 | * just go back and repeat. | 1953 | * just go back and repeat. |
| 1919 | */ | 1954 | */ |
| 1920 | rq = task_rq_lock(p, &flags); | 1955 | rq = task_rq_lock(p, &flags); |
| 1956 | trace_sched_wait_task(rq, p); | ||
| 1921 | running = task_running(rq, p); | 1957 | running = task_running(rq, p); |
| 1922 | on_rq = p->se.on_rq; | 1958 | on_rq = p->se.on_rq; |
| 1923 | ncsw = 0; | 1959 | ncsw = 0; |
| 1924 | if (!match_state || p->state == match_state) { | 1960 | if (!match_state || p->state == match_state) |
| 1925 | ncsw = p->nivcsw + p->nvcsw; | 1961 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
| 1926 | if (unlikely(!ncsw)) | ||
| 1927 | ncsw = 1; | ||
| 1928 | } | ||
| 1929 | task_rq_unlock(rq, &flags); | 1962 | task_rq_unlock(rq, &flags); |
| 1930 | 1963 | ||
| 1931 | /* | 1964 | /* |
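The reworked wait_task_inactive() snapshot ORs LONG_MIN into nvcsw so the saved value is never zero (zero is reserved for "state did not match"), while the low bits still change with every voluntary context switch. A tiny stand-alone demonstration of that encoding:

```c
#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long nvcsw = 0;		/* even a zero count ...  */
	unsigned long ncsw  = nvcsw | LONG_MIN;	/* ... yields non-zero    */

	printf("ncsw = %#lx (non-zero: %d)\n", ncsw, ncsw != 0);

	nvcsw = 42;
	printf("ncsw = %#lx (count in low bits: %lu)\n",
	       nvcsw | LONG_MIN,
	       (nvcsw | LONG_MIN) & ~(unsigned long)LONG_MIN);
	return 0;
}
```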
| @@ -2282,10 +2315,8 @@ out_activate: | |||
| 2282 | success = 1; | 2315 | success = 1; |
| 2283 | 2316 | ||
| 2284 | out_running: | 2317 | out_running: |
| 2285 | trace_mark(kernel_sched_wakeup, | 2318 | trace_sched_wakeup(rq, p); |
| 2286 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2319 | check_preempt_curr(rq, p, sync); |
| 2287 | p->pid, p->state, rq, p, rq->curr); | ||
| 2288 | check_preempt_curr(rq, p); | ||
| 2289 | 2320 | ||
| 2290 | p->state = TASK_RUNNING; | 2321 | p->state = TASK_RUNNING; |
| 2291 | #ifdef CONFIG_SMP | 2322 | #ifdef CONFIG_SMP |
| @@ -2417,10 +2448,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2417 | p->sched_class->task_new(rq, p); | 2448 | p->sched_class->task_new(rq, p); |
| 2418 | inc_nr_running(rq); | 2449 | inc_nr_running(rq); |
| 2419 | } | 2450 | } |
| 2420 | trace_mark(kernel_sched_wakeup_new, | 2451 | trace_sched_wakeup_new(rq, p); |
| 2421 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2452 | check_preempt_curr(rq, p, 0); |
| 2422 | p->pid, p->state, rq, p, rq->curr); | ||
| 2423 | check_preempt_curr(rq, p); | ||
| 2424 | #ifdef CONFIG_SMP | 2453 | #ifdef CONFIG_SMP |
| 2425 | if (p->sched_class->task_wake_up) | 2454 | if (p->sched_class->task_wake_up) |
| 2426 | p->sched_class->task_wake_up(rq, p); | 2455 | p->sched_class->task_wake_up(rq, p); |
| @@ -2592,11 +2621,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2592 | struct mm_struct *mm, *oldmm; | 2621 | struct mm_struct *mm, *oldmm; |
| 2593 | 2622 | ||
| 2594 | prepare_task_switch(rq, prev, next); | 2623 | prepare_task_switch(rq, prev, next); |
| 2595 | trace_mark(kernel_sched_schedule, | 2624 | trace_sched_switch(rq, prev, next); |
| 2596 | "prev_pid %d next_pid %d prev_state %ld " | ||
| 2597 | "## rq %p prev %p next %p", | ||
| 2598 | prev->pid, next->pid, prev->state, | ||
| 2599 | rq, prev, next); | ||
| 2600 | mm = next->mm; | 2625 | mm = next->mm; |
| 2601 | oldmm = prev->active_mm; | 2626 | oldmm = prev->active_mm; |
| 2602 | /* | 2627 | /* |
| @@ -2836,6 +2861,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) | |||
| 2836 | || unlikely(!cpu_active(dest_cpu))) | 2861 | || unlikely(!cpu_active(dest_cpu))) |
| 2837 | goto out; | 2862 | goto out; |
| 2838 | 2863 | ||
| 2864 | trace_sched_migrate_task(rq, p, dest_cpu); | ||
| 2839 | /* force the process onto the specified CPU */ | 2865 | /* force the process onto the specified CPU */ |
| 2840 | if (migrate_task(p, dest_cpu, &req)) { | 2866 | if (migrate_task(p, dest_cpu, &req)) { |
| 2841 | /* Need to wait for migration thread (might exit: take ref). */ | 2867 | /* Need to wait for migration thread (might exit: take ref). */ |
| @@ -2880,7 +2906,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 2880 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2906 | * Note that idle threads have a prio of MAX_PRIO, for this test |
| 2881 | * to be always true for them. | 2907 | * to be always true for them. |
| 2882 | */ | 2908 | */ |
| 2883 | check_preempt_curr(this_rq, p); | 2909 | check_preempt_curr(this_rq, p, 0); |
| 2884 | } | 2910 | } |
| 2885 | 2911 | ||
| 2886 | /* | 2912 | /* |
| @@ -3329,7 +3355,7 @@ small_imbalance: | |||
| 3329 | } else | 3355 | } else |
| 3330 | this_load_per_task = cpu_avg_load_per_task(this_cpu); | 3356 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
| 3331 | 3357 | ||
| 3332 | if (max_load - this_load + 2*busiest_load_per_task >= | 3358 | if (max_load - this_load + busiest_load_per_task >= |
| 3333 | busiest_load_per_task * imbn) { | 3359 | busiest_load_per_task * imbn) { |
| 3334 | *imbalance = busiest_load_per_task; | 3360 | *imbalance = busiest_load_per_task; |
| 3335 | return busiest; | 3361 | return busiest; |
| @@ -4037,23 +4063,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); | |||
| 4037 | EXPORT_PER_CPU_SYMBOL(kstat); | 4063 | EXPORT_PER_CPU_SYMBOL(kstat); |
| 4038 | 4064 | ||
| 4039 | /* | 4065 | /* |
| 4040 | * Return p->sum_exec_runtime plus any more ns on the sched_clock | 4066 | * Return any ns on the sched_clock that have not yet been banked in |
| 4041 | * that have not yet been banked in case the task is currently running. | 4067 | * @p in case that task is currently running. |
| 4042 | */ | 4068 | */ |
| 4043 | unsigned long long task_sched_runtime(struct task_struct *p) | 4069 | unsigned long long task_delta_exec(struct task_struct *p) |
| 4044 | { | 4070 | { |
| 4045 | unsigned long flags; | 4071 | unsigned long flags; |
| 4046 | u64 ns, delta_exec; | ||
| 4047 | struct rq *rq; | 4072 | struct rq *rq; |
| 4073 | u64 ns = 0; | ||
| 4048 | 4074 | ||
| 4049 | rq = task_rq_lock(p, &flags); | 4075 | rq = task_rq_lock(p, &flags); |
| 4050 | ns = p->se.sum_exec_runtime; | 4076 | |
| 4051 | if (task_current(rq, p)) { | 4077 | if (task_current(rq, p)) { |
| 4078 | u64 delta_exec; | ||
| 4079 | |||
| 4052 | update_rq_clock(rq); | 4080 | update_rq_clock(rq); |
| 4053 | delta_exec = rq->clock - p->se.exec_start; | 4081 | delta_exec = rq->clock - p->se.exec_start; |
| 4054 | if ((s64)delta_exec > 0) | 4082 | if ((s64)delta_exec > 0) |
| 4055 | ns += delta_exec; | 4083 | ns = delta_exec; |
| 4056 | } | 4084 | } |
| 4085 | |||
| 4057 | task_rq_unlock(rq, &flags); | 4086 | task_rq_unlock(rq, &flags); |
| 4058 | 4087 | ||
| 4059 | return ns; | 4088 | return ns; |
| @@ -4070,6 +4099,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
| 4070 | cputime64_t tmp; | 4099 | cputime64_t tmp; |
| 4071 | 4100 | ||
| 4072 | p->utime = cputime_add(p->utime, cputime); | 4101 | p->utime = cputime_add(p->utime, cputime); |
| 4102 | account_group_user_time(p, cputime); | ||
| 4073 | 4103 | ||
| 4074 | /* Add user time to cpustat. */ | 4104 | /* Add user time to cpustat. */ |
| 4075 | tmp = cputime_to_cputime64(cputime); | 4105 | tmp = cputime_to_cputime64(cputime); |
| @@ -4094,6 +4124,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) | |||
| 4094 | tmp = cputime_to_cputime64(cputime); | 4124 | tmp = cputime_to_cputime64(cputime); |
| 4095 | 4125 | ||
| 4096 | p->utime = cputime_add(p->utime, cputime); | 4126 | p->utime = cputime_add(p->utime, cputime); |
| 4127 | account_group_user_time(p, cputime); | ||
| 4097 | p->gtime = cputime_add(p->gtime, cputime); | 4128 | p->gtime = cputime_add(p->gtime, cputime); |
| 4098 | 4129 | ||
| 4099 | cpustat->user = cputime64_add(cpustat->user, tmp); | 4130 | cpustat->user = cputime64_add(cpustat->user, tmp); |
| @@ -4129,6 +4160,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 4129 | } | 4160 | } |
| 4130 | 4161 | ||
| 4131 | p->stime = cputime_add(p->stime, cputime); | 4162 | p->stime = cputime_add(p->stime, cputime); |
| 4163 | account_group_system_time(p, cputime); | ||
| 4132 | 4164 | ||
| 4133 | /* Add system time to cpustat. */ | 4165 | /* Add system time to cpustat. */ |
| 4134 | tmp = cputime_to_cputime64(cputime); | 4166 | tmp = cputime_to_cputime64(cputime); |
| @@ -4170,6 +4202,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 4170 | 4202 | ||
| 4171 | if (p == rq->idle) { | 4203 | if (p == rq->idle) { |
| 4172 | p->stime = cputime_add(p->stime, steal); | 4204 | p->stime = cputime_add(p->stime, steal); |
| 4205 | account_group_system_time(p, steal); | ||
| 4173 | if (atomic_read(&rq->nr_iowait) > 0) | 4206 | if (atomic_read(&rq->nr_iowait) > 0) |
| 4174 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 4207 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
| 4175 | else | 4208 | else |
| @@ -4426,12 +4459,8 @@ need_resched_nonpreemptible: | |||
| 4426 | if (sched_feat(HRTICK)) | 4459 | if (sched_feat(HRTICK)) |
| 4427 | hrtick_clear(rq); | 4460 | hrtick_clear(rq); |
| 4428 | 4461 | ||
| 4429 | /* | 4462 | spin_lock_irq(&rq->lock); |
| 4430 | * Do the rq-clock update outside the rq lock: | ||
| 4431 | */ | ||
| 4432 | local_irq_disable(); | ||
| 4433 | update_rq_clock(rq); | 4463 | update_rq_clock(rq); |
| 4434 | spin_lock(&rq->lock); | ||
| 4435 | clear_tsk_need_resched(prev); | 4464 | clear_tsk_need_resched(prev); |
| 4436 | 4465 | ||
| 4437 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4466 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| @@ -4627,6 +4656,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | |||
| 4627 | } | 4656 | } |
| 4628 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4657 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
| 4629 | 4658 | ||
| 4659 | /** | ||
| 4660 | * complete: - signals a single thread waiting on this completion | ||
| 4661 | * @x: holds the state of this particular completion | ||
| 4662 | * | ||
| 4663 | * This will wake up a single thread waiting on this completion. Threads will be | ||
| 4664 | * awakened in the same order in which they were queued. | ||
| 4665 | * | ||
| 4666 | * See also complete_all(), wait_for_completion() and related routines. | ||
| 4667 | */ | ||
| 4630 | void complete(struct completion *x) | 4668 | void complete(struct completion *x) |
| 4631 | { | 4669 | { |
| 4632 | unsigned long flags; | 4670 | unsigned long flags; |
| @@ -4638,6 +4676,12 @@ void complete(struct completion *x) | |||
| 4638 | } | 4676 | } |
| 4639 | EXPORT_SYMBOL(complete); | 4677 | EXPORT_SYMBOL(complete); |
| 4640 | 4678 | ||
| 4679 | /** | ||
| 4680 | * complete_all: - signals all threads waiting on this completion | ||
| 4681 | * @x: holds the state of this particular completion | ||
| 4682 | * | ||
| 4683 | * This will wake up all threads waiting on this particular completion event. | ||
| 4684 | */ | ||
| 4641 | void complete_all(struct completion *x) | 4685 | void complete_all(struct completion *x) |
| 4642 | { | 4686 | { |
| 4643 | unsigned long flags; | 4687 | unsigned long flags; |
| @@ -4658,10 +4702,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
| 4658 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4702 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
| 4659 | __add_wait_queue_tail(&x->wait, &wait); | 4703 | __add_wait_queue_tail(&x->wait, &wait); |
| 4660 | do { | 4704 | do { |
| 4661 | if ((state == TASK_INTERRUPTIBLE && | 4705 | if (signal_pending_state(state, current)) { |
| 4662 | signal_pending(current)) || | ||
| 4663 | (state == TASK_KILLABLE && | ||
| 4664 | fatal_signal_pending(current))) { | ||
| 4665 | timeout = -ERESTARTSYS; | 4706 | timeout = -ERESTARTSYS; |
| 4666 | break; | 4707 | break; |
| 4667 | } | 4708 | } |
| @@ -4689,12 +4730,31 @@ wait_for_common(struct completion *x, long timeout, int state) | |||
| 4689 | return timeout; | 4730 | return timeout; |
| 4690 | } | 4731 | } |
| 4691 | 4732 | ||
| 4733 | /** | ||
| 4734 | * wait_for_completion: - waits for completion of a task | ||
| 4735 | * @x: holds the state of this particular completion | ||
| 4736 | * | ||
| 4737 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 4738 | * interruptible and there is no timeout. | ||
| 4739 | * | ||
| 4740 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
| 4741 | * and interrupt capability. Also see complete(). | ||
| 4742 | */ | ||
| 4692 | void __sched wait_for_completion(struct completion *x) | 4743 | void __sched wait_for_completion(struct completion *x) |
| 4693 | { | 4744 | { |
| 4694 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4745 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
| 4695 | } | 4746 | } |
| 4696 | EXPORT_SYMBOL(wait_for_completion); | 4747 | EXPORT_SYMBOL(wait_for_completion); |
| 4697 | 4748 | ||
| 4749 | /** | ||
| 4750 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
| 4751 | * @x: holds the state of this particular completion | ||
| 4752 | * @timeout: timeout value in jiffies | ||
| 4753 | * | ||
| 4754 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 4755 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 4756 | * interruptible. | ||
| 4757 | */ | ||
| 4698 | unsigned long __sched | 4758 | unsigned long __sched |
| 4699 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4759 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
| 4700 | { | 4760 | { |
| @@ -4702,6 +4762,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
| 4702 | } | 4762 | } |
| 4703 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4763 | EXPORT_SYMBOL(wait_for_completion_timeout); |
| 4704 | 4764 | ||
| 4765 | /** | ||
| 4766 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
| 4767 | * @x: holds the state of this particular completion | ||
| 4768 | * | ||
| 4769 | * This waits for completion of a specific task to be signaled. It is | ||
| 4770 | * interruptible. | ||
| 4771 | */ | ||
| 4705 | int __sched wait_for_completion_interruptible(struct completion *x) | 4772 | int __sched wait_for_completion_interruptible(struct completion *x) |
| 4706 | { | 4773 | { |
| 4707 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4774 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
| @@ -4711,6 +4778,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) | |||
| 4711 | } | 4778 | } |
| 4712 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4779 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
| 4713 | 4780 | ||
| 4781 | /** | ||
| 4782 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
| 4783 | * @x: holds the state of this particular completion | ||
| 4784 | * @timeout: timeout value in jiffies | ||
| 4785 | * | ||
| 4786 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 4787 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
| 4788 | */ | ||
| 4714 | unsigned long __sched | 4789 | unsigned long __sched |
| 4715 | wait_for_completion_interruptible_timeout(struct completion *x, | 4790 | wait_for_completion_interruptible_timeout(struct completion *x, |
| 4716 | unsigned long timeout) | 4791 | unsigned long timeout) |
| @@ -4719,6 +4794,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, | |||
| 4719 | } | 4794 | } |
| 4720 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4795 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
| 4721 | 4796 | ||
| 4797 | /** | ||
| 4798 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
| 4799 | * @x: holds the state of this particular completion | ||
| 4800 | * | ||
| 4801 | * This waits to be signaled for completion of a specific task. It can be | ||
| 4802 | * interrupted by a kill signal. | ||
| 4803 | */ | ||
| 4722 | int __sched wait_for_completion_killable(struct completion *x) | 4804 | int __sched wait_for_completion_killable(struct completion *x) |
| 4723 | { | 4805 | { |
| 4724 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | 4806 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
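The kernel-doc added above documents the completion API itself; as a usage illustration, a minimal hypothetical module (names like setup_done and completion_demo_init are made up) could pair complete() in a worker thread with wait_for_completion_timeout() in the caller:

```c
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/delay.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
	msleep(100);			/* pretend to do some setup work */
	complete(&setup_done);		/* wakes one waiter, in queue order */
	return 0;
}

static int __init completion_demo_init(void)
{
	kthread_run(worker_fn, NULL, "completion-demo");

	/* Interruptible/killable variants exist as documented above;
	 * this one just bounds the wait to one second. */
	if (!wait_for_completion_timeout(&setup_done, msecs_to_jiffies(1000)))
		printk(KERN_WARNING "completion-demo: timed out\n");
	return 0;
}

static void __exit completion_demo_exit(void)
{
}

module_init(completion_demo_init);
module_exit(completion_demo_exit);
MODULE_LICENSE("GPL");
```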
| @@ -5121,7 +5203,8 @@ recheck: | |||
| 5121 | * Do not allow realtime tasks into groups that have no runtime | 5203 | * Do not allow realtime tasks into groups that have no runtime |
| 5122 | * assigned. | 5204 | * assigned. |
| 5123 | */ | 5205 | */ |
| 5124 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5206 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
| 5207 | task_group(p)->rt_bandwidth.rt_runtime == 0) | ||
| 5125 | return -EPERM; | 5208 | return -EPERM; |
| 5126 | #endif | 5209 | #endif |
| 5127 | 5210 | ||
| @@ -5787,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5787 | struct rq *rq = cpu_rq(cpu); | 5870 | struct rq *rq = cpu_rq(cpu); |
| 5788 | unsigned long flags; | 5871 | unsigned long flags; |
| 5789 | 5872 | ||
| 5873 | spin_lock_irqsave(&rq->lock, flags); | ||
| 5874 | |||
| 5790 | __sched_fork(idle); | 5875 | __sched_fork(idle); |
| 5791 | idle->se.exec_start = sched_clock(); | 5876 | idle->se.exec_start = sched_clock(); |
| 5792 | 5877 | ||
| @@ -5794,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5794 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 5879 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
| 5795 | __set_task_cpu(idle, cpu); | 5880 | __set_task_cpu(idle, cpu); |
| 5796 | 5881 | ||
| 5797 | spin_lock_irqsave(&rq->lock, flags); | ||
| 5798 | rq->curr = rq->idle = idle; | 5882 | rq->curr = rq->idle = idle; |
| 5799 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5883 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
| 5800 | idle->oncpu = 1; | 5884 | idle->oncpu = 1; |
| @@ -5957,7 +6041,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 5957 | set_task_cpu(p, dest_cpu); | 6041 | set_task_cpu(p, dest_cpu); |
| 5958 | if (on_rq) { | 6042 | if (on_rq) { |
| 5959 | activate_task(rq_dest, p, 0); | 6043 | activate_task(rq_dest, p, 0); |
| 5960 | check_preempt_curr(rq_dest, p); | 6044 | check_preempt_curr(rq_dest, p, 0); |
| 5961 | } | 6045 | } |
| 5962 | done: | 6046 | done: |
| 5963 | ret = 1; | 6047 | ret = 1; |
| @@ -6282,7 +6366,7 @@ set_table_entry(struct ctl_table *entry, | |||
| 6282 | static struct ctl_table * | 6366 | static struct ctl_table * |
| 6283 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 6367 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
| 6284 | { | 6368 | { |
| 6285 | struct ctl_table *table = sd_alloc_ctl_entry(12); | 6369 | struct ctl_table *table = sd_alloc_ctl_entry(13); |
| 6286 | 6370 | ||
| 6287 | if (table == NULL) | 6371 | if (table == NULL) |
| 6288 | return NULL; | 6372 | return NULL; |
| @@ -6310,7 +6394,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 6310 | sizeof(int), 0644, proc_dointvec_minmax); | 6394 | sizeof(int), 0644, proc_dointvec_minmax); |
| 6311 | set_table_entry(&table[10], "flags", &sd->flags, | 6395 | set_table_entry(&table[10], "flags", &sd->flags, |
| 6312 | sizeof(int), 0644, proc_dointvec_minmax); | 6396 | sizeof(int), 0644, proc_dointvec_minmax); |
| 6313 | /* &table[11] is terminator */ | 6397 | set_table_entry(&table[11], "name", sd->name, |
| 6398 | CORENAME_MAX_SIZE, 0444, proc_dostring); | ||
| 6399 | /* &table[12] is terminator */ | ||
| 6314 | 6400 | ||
| 6315 | return table; | 6401 | return table; |
| 6316 | } | 6402 | } |
| @@ -6802,15 +6888,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6802 | struct sched_domain *tmp; | 6888 | struct sched_domain *tmp; |
| 6803 | 6889 | ||
| 6804 | /* Remove the sched domains which do not contribute to scheduling. */ | 6890 | /* Remove the sched domains which do not contribute to scheduling. */ |
| 6805 | for (tmp = sd; tmp; tmp = tmp->parent) { | 6891 | for (tmp = sd; tmp; ) { |
| 6806 | struct sched_domain *parent = tmp->parent; | 6892 | struct sched_domain *parent = tmp->parent; |
| 6807 | if (!parent) | 6893 | if (!parent) |
| 6808 | break; | 6894 | break; |
| 6895 | |||
| 6809 | if (sd_parent_degenerate(tmp, parent)) { | 6896 | if (sd_parent_degenerate(tmp, parent)) { |
| 6810 | tmp->parent = parent->parent; | 6897 | tmp->parent = parent->parent; |
| 6811 | if (parent->parent) | 6898 | if (parent->parent) |
| 6812 | parent->parent->child = tmp; | 6899 | parent->parent->child = tmp; |
| 6813 | } | 6900 | } else |
| 6901 | tmp = tmp->parent; | ||
| 6814 | } | 6902 | } |
| 6815 | 6903 | ||
| 6816 | if (sd && sd_degenerate(sd)) { | 6904 | if (sd && sd_degenerate(sd)) { |
| @@ -7194,13 +7282,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 7194 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 7282 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
| 7195 | */ | 7283 | */ |
| 7196 | 7284 | ||
| 7285 | #ifdef CONFIG_SCHED_DEBUG | ||
| 7286 | # define SD_INIT_NAME(sd, type) sd->name = #type | ||
| 7287 | #else | ||
| 7288 | # define SD_INIT_NAME(sd, type) do { } while (0) | ||
| 7289 | #endif | ||
| 7290 | |||
| 7197 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7291 | #define SD_INIT(sd, type) sd_init_##type(sd) |
| 7292 | |||
| 7198 | #define SD_INIT_FUNC(type) \ | 7293 | #define SD_INIT_FUNC(type) \ |
| 7199 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7294 | static noinline void sd_init_##type(struct sched_domain *sd) \ |
| 7200 | { \ | 7295 | { \ |
| 7201 | memset(sd, 0, sizeof(*sd)); \ | 7296 | memset(sd, 0, sizeof(*sd)); \ |
| 7202 | *sd = SD_##type##_INIT; \ | 7297 | *sd = SD_##type##_INIT; \ |
| 7203 | sd->level = SD_LV_##type; \ | 7298 | sd->level = SD_LV_##type; \ |
| 7299 | SD_INIT_NAME(sd, type); \ | ||
| 7204 | } | 7300 | } |
| 7205 | 7301 | ||
| 7206 | SD_INIT_FUNC(CPU) | 7302 | SD_INIT_FUNC(CPU) |
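SD_INIT_NAME relies on preprocessor stringification, so on CONFIG_SCHED_DEBUG builds SD_INIT(sd, CPU) ends up doing sd->name = "CPU" (and compiles away otherwise). A stand-alone illustration of the same macro trick, with a made-up struct in place of sched_domain:

```c
#include <stdio.h>

struct sched_domain_like {
	const char *name;
};

#define SD_INIT_NAME(sd, type)	((sd)->name = #type)

int main(void)
{
	struct sched_domain_like sd;

	SD_INIT_NAME(&sd, CPU);
	printf("%s\n", sd.name);	/* prints "CPU" */
	return 0;
}
```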
| @@ -7591,6 +7687,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
| 7591 | error: | 7687 | error: |
| 7592 | free_sched_groups(cpu_map, tmpmask); | 7688 | free_sched_groups(cpu_map, tmpmask); |
| 7593 | SCHED_CPUMASK_FREE((void *)allmasks); | 7689 | SCHED_CPUMASK_FREE((void *)allmasks); |
| 7690 | kfree(rd); | ||
| 7594 | return -ENOMEM; | 7691 | return -ENOMEM; |
| 7595 | #endif | 7692 | #endif |
| 7596 | } | 7693 | } |
| @@ -8242,20 +8339,25 @@ void __might_sleep(char *file, int line) | |||
| 8242 | #ifdef in_atomic | 8339 | #ifdef in_atomic |
| 8243 | static unsigned long prev_jiffy; /* ratelimiting */ | 8340 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 8244 | 8341 | ||
| 8245 | if ((in_atomic() || irqs_disabled()) && | 8342 | if ((!in_atomic() && !irqs_disabled()) || |
| 8246 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 8343 | system_state != SYSTEM_RUNNING || oops_in_progress) |
| 8247 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8344 | return; |
| 8248 | return; | 8345 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 8249 | prev_jiffy = jiffies; | 8346 | return; |
| 8250 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 8347 | prev_jiffy = jiffies; |
| 8251 | " context at %s:%d\n", file, line); | 8348 | |
| 8252 | printk("in_atomic():%d, irqs_disabled():%d\n", | 8349 | printk(KERN_ERR |
| 8253 | in_atomic(), irqs_disabled()); | 8350 | "BUG: sleeping function called from invalid context at %s:%d\n", |
| 8254 | debug_show_held_locks(current); | 8351 | file, line); |
| 8255 | if (irqs_disabled()) | 8352 | printk(KERN_ERR |
| 8256 | print_irqtrace_events(current); | 8353 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
| 8257 | dump_stack(); | 8354 | in_atomic(), irqs_disabled(), |
| 8258 | } | 8355 | current->pid, current->comm); |
| 8356 | |||
| 8357 | debug_show_held_locks(current); | ||
| 8358 | if (irqs_disabled()) | ||
| 8359 | print_irqtrace_events(current); | ||
| 8360 | dump_stack(); | ||
| 8259 | #endif | 8361 | #endif |
| 8260 | } | 8362 | } |
| 8261 | EXPORT_SYMBOL(__might_sleep); | 8363 | EXPORT_SYMBOL(__might_sleep); |
| @@ -8753,73 +8855,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); | |||
| 8753 | static unsigned long to_ratio(u64 period, u64 runtime) | 8855 | static unsigned long to_ratio(u64 period, u64 runtime) |
| 8754 | { | 8856 | { |
| 8755 | if (runtime == RUNTIME_INF) | 8857 | if (runtime == RUNTIME_INF) |
| 8756 | return 1ULL << 16; | 8858 | return 1ULL << 20; |
| 8757 | 8859 | ||
| 8758 | return div64_u64(runtime << 16, period); | 8860 | return div64_u64(runtime << 20, period); |
| 8759 | } | 8861 | } |
| 8760 | 8862 | ||
| 8761 | #ifdef CONFIG_CGROUP_SCHED | 8863 | /* Must be called with tasklist_lock held */ |
| 8762 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8864 | static inline int tg_has_rt_tasks(struct task_group *tg) |
| 8763 | { | 8865 | { |
| 8764 | struct task_group *tgi, *parent = tg->parent; | 8866 | struct task_struct *g, *p; |
| 8765 | unsigned long total = 0; | ||
| 8766 | 8867 | ||
| 8767 | if (!parent) { | 8868 | do_each_thread(g, p) { |
| 8768 | if (global_rt_period() < period) | 8869 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) |
| 8769 | return 0; | 8870 | return 1; |
| 8871 | } while_each_thread(g, p); | ||
| 8770 | 8872 | ||
| 8771 | return to_ratio(period, runtime) < | 8873 | return 0; |
| 8772 | to_ratio(global_rt_period(), global_rt_runtime()); | 8874 | } |
| 8773 | } | ||
| 8774 | 8875 | ||
| 8775 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | 8876 | struct rt_schedulable_data { |
| 8776 | return 0; | 8877 | struct task_group *tg; |
| 8878 | u64 rt_period; | ||
| 8879 | u64 rt_runtime; | ||
| 8880 | }; | ||
| 8777 | 8881 | ||
| 8778 | rcu_read_lock(); | 8882 | static int tg_schedulable(struct task_group *tg, void *data) |
| 8779 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | 8883 | { |
| 8780 | if (tgi == tg) | 8884 | struct rt_schedulable_data *d = data; |
| 8781 | continue; | 8885 | struct task_group *child; |
| 8886 | unsigned long total, sum = 0; | ||
| 8887 | u64 period, runtime; | ||
| 8888 | |||
| 8889 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 8890 | runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8782 | 8891 | ||
| 8783 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8892 | if (tg == d->tg) { |
| 8784 | tgi->rt_bandwidth.rt_runtime); | 8893 | period = d->rt_period; |
| 8894 | runtime = d->rt_runtime; | ||
| 8785 | } | 8895 | } |
| 8786 | rcu_read_unlock(); | ||
| 8787 | 8896 | ||
| 8788 | return total + to_ratio(period, runtime) <= | 8897 | /* |
| 8789 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8898 | * Cannot have more runtime than the period. |
| 8790 | parent->rt_bandwidth.rt_runtime); | 8899 | */ |
| 8791 | } | 8900 | if (runtime > period && runtime != RUNTIME_INF) |
| 8792 | #elif defined CONFIG_USER_SCHED | 8901 | return -EINVAL; |
| 8793 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
| 8794 | { | ||
| 8795 | struct task_group *tgi; | ||
| 8796 | unsigned long total = 0; | ||
| 8797 | unsigned long global_ratio = | ||
| 8798 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 8799 | 8902 | ||
| 8800 | rcu_read_lock(); | 8903 | /* |
| 8801 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8904 | * Ensure we don't starve existing RT tasks. |
| 8802 | if (tgi == tg) | 8905 | */ |
| 8803 | continue; | 8906 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) |
| 8907 | return -EBUSY; | ||
| 8908 | |||
| 8909 | total = to_ratio(period, runtime); | ||
| 8804 | 8910 | ||
| 8805 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8911 | /* |
| 8806 | tgi->rt_bandwidth.rt_runtime); | 8912 | * Nobody can have more than the global setting allows. |
| 8913 | */ | ||
| 8914 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
| 8915 | return -EINVAL; | ||
| 8916 | |||
| 8917 | /* | ||
| 8918 | * The sum of our children's runtime should not exceed our own. | ||
| 8919 | */ | ||
| 8920 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
| 8921 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
| 8922 | runtime = child->rt_bandwidth.rt_runtime; | ||
| 8923 | |||
| 8924 | if (child == d->tg) { | ||
| 8925 | period = d->rt_period; | ||
| 8926 | runtime = d->rt_runtime; | ||
| 8927 | } | ||
| 8928 | |||
| 8929 | sum += to_ratio(period, runtime); | ||
| 8807 | } | 8930 | } |
| 8808 | rcu_read_unlock(); | ||
| 8809 | 8931 | ||
| 8810 | return total + to_ratio(period, runtime) < global_ratio; | 8932 | if (sum > total) |
| 8933 | return -EINVAL; | ||
| 8934 | |||
| 8935 | return 0; | ||
| 8811 | } | 8936 | } |
| 8812 | #endif | ||
| 8813 | 8937 | ||
| 8814 | /* Must be called with tasklist_lock held */ | 8938 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 8815 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
| 8816 | { | 8939 | { |
| 8817 | struct task_struct *g, *p; | 8940 | struct rt_schedulable_data data = { |
| 8818 | do_each_thread(g, p) { | 8941 | .tg = tg, |
| 8819 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 8942 | .rt_period = period, |
| 8820 | return 1; | 8943 | .rt_runtime = runtime, |
| 8821 | } while_each_thread(g, p); | 8944 | }; |
| 8822 | return 0; | 8945 | |
| 8946 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | ||
| 8823 | } | 8947 | } |
| 8824 | 8948 | ||
| 8825 | static int tg_set_bandwidth(struct task_group *tg, | 8949 | static int tg_set_bandwidth(struct task_group *tg, |
| @@ -8829,14 +8953,9 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
| 8829 | 8953 | ||
| 8830 | mutex_lock(&rt_constraints_mutex); | 8954 | mutex_lock(&rt_constraints_mutex); |
| 8831 | read_lock(&tasklist_lock); | 8955 | read_lock(&tasklist_lock); |
| 8832 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { | 8956 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
| 8833 | err = -EBUSY; | 8957 | if (err) |
| 8834 | goto unlock; | ||
| 8835 | } | ||
| 8836 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { | ||
| 8837 | err = -EINVAL; | ||
| 8838 | goto unlock; | 8958 | goto unlock; |
| 8839 | } | ||
| 8840 | 8959 | ||
| 8841 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8960 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
| 8842 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8961 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
| @@ -8905,19 +9024,25 @@ long sched_group_rt_period(struct task_group *tg) | |||
| 8905 | 9024 | ||
| 8906 | static int sched_rt_global_constraints(void) | 9025 | static int sched_rt_global_constraints(void) |
| 8907 | { | 9026 | { |
| 8908 | struct task_group *tg = &root_task_group; | 9027 | u64 runtime, period; |
| 8909 | u64 rt_runtime, rt_period; | ||
| 8910 | int ret = 0; | 9028 | int ret = 0; |
| 8911 | 9029 | ||
| 8912 | if (sysctl_sched_rt_period <= 0) | 9030 | if (sysctl_sched_rt_period <= 0) |
| 8913 | return -EINVAL; | 9031 | return -EINVAL; |
| 8914 | 9032 | ||
| 8915 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | 9033 | runtime = global_rt_runtime(); |
| 8916 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 9034 | period = global_rt_period(); |
| 9035 | |||
| 9036 | /* | ||
| 9037 | * Sanity check on the sysctl variables. | ||
| 9038 | */ | ||
| 9039 | if (runtime > period && runtime != RUNTIME_INF) | ||
| 9040 | return -EINVAL; | ||
| 8917 | 9041 | ||
| 8918 | mutex_lock(&rt_constraints_mutex); | 9042 | mutex_lock(&rt_constraints_mutex); |
| 8919 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) | 9043 | read_lock(&tasklist_lock); |
| 8920 | ret = -EINVAL; | 9044 | ret = __rt_schedulable(NULL, 0, 0); |
| 9045 | read_unlock(&tasklist_lock); | ||
| 8921 | mutex_unlock(&rt_constraints_mutex); | 9046 | mutex_unlock(&rt_constraints_mutex); |
| 8922 | 9047 | ||
| 8923 | return ret; | 9048 | return ret; |
| @@ -8991,7 +9116,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 8991 | 9116 | ||
| 8992 | if (!cgrp->parent) { | 9117 | if (!cgrp->parent) { |
| 8993 | /* This is early initialization for the top cgroup */ | 9118 | /* This is early initialization for the top cgroup */ |
| 8994 | init_task_group.css.cgroup = cgrp; | ||
| 8995 | return &init_task_group.css; | 9119 | return &init_task_group.css; |
| 8996 | } | 9120 | } |
| 8997 | 9121 | ||
| @@ -9000,9 +9124,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 9000 | if (IS_ERR(tg)) | 9124 | if (IS_ERR(tg)) |
| 9001 | return ERR_PTR(-ENOMEM); | 9125 | return ERR_PTR(-ENOMEM); |
| 9002 | 9126 | ||
| 9003 | /* Bind the cgroup to task_group object we just created */ | ||
| 9004 | tg->css.cgroup = cgrp; | ||
| 9005 | |||
| 9006 | return &tg->css; | 9127 | return &tg->css; |
| 9007 | } | 9128 | } |
| 9008 | 9129 | ||
