diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu.c | 24 | ||||
-rw-r--r-- | kernel/cpuset.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 377 | ||||
-rw-r--r-- | kernel/sched_fair.c | 234 | ||||
-rw-r--r-- | kernel/sched_features.h | 1 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 6 | ||||
-rw-r--r-- | kernel/sched_rt.c | 57 | ||||
-rw-r--r-- | kernel/user.c | 4 |
8 files changed, 378 insertions, 327 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..86d49045daed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) | |||
199 | struct take_cpu_down_param *param = _param; | 199 | struct take_cpu_down_param *param = _param; |
200 | int err; | 200 | int err; |
201 | 201 | ||
202 | raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, | ||
203 | param->hcpu); | ||
204 | /* Ensure this CPU doesn't handle any more interrupts. */ | 202 | /* Ensure this CPU doesn't handle any more interrupts. */ |
205 | err = __cpu_disable(); | 203 | err = __cpu_disable(); |
206 | if (err < 0) | 204 | if (err < 0) |
207 | return err; | 205 | return err; |
208 | 206 | ||
207 | raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, | ||
208 | param->hcpu); | ||
209 | |||
209 | /* Force idle task to run as soon as we yield: it should | 210 | /* Force idle task to run as soon as we yield: it should |
210 | immediately notice cpu is offline and die quickly. */ | 211 | immediately notice cpu is offline and die quickly. */ |
211 | sched_idle_next(); | 212 | sched_idle_next(); |
@@ -453,6 +454,25 @@ out: | |||
453 | } | 454 | } |
454 | #endif /* CONFIG_PM_SLEEP_SMP */ | 455 | #endif /* CONFIG_PM_SLEEP_SMP */ |
455 | 456 | ||
457 | /** | ||
458 | * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers | ||
459 | * @cpu: cpu that just started | ||
460 | * | ||
461 | * This function calls the cpu_chain notifiers with CPU_STARTING. | ||
462 | * It must be called by the arch code on the new cpu, before the new cpu | ||
463 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). | ||
464 | */ | ||
465 | void notify_cpu_starting(unsigned int cpu) | ||
466 | { | ||
467 | unsigned long val = CPU_STARTING; | ||
468 | |||
469 | #ifdef CONFIG_PM_SLEEP_SMP | ||
470 | if (cpu_isset(cpu, frozen_cpus)) | ||
471 | val = CPU_STARTING_FROZEN; | ||
472 | #endif /* CONFIG_PM_SLEEP_SMP */ | ||
473 | raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); | ||
474 | } | ||
475 | |||
456 | #endif /* CONFIG_SMP */ | 476 | #endif /* CONFIG_SMP */ |
457 | 477 | ||
458 | /* | 478 | /* |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb2..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1921 | * that has tasks along with an empty 'mems'. But if we did see such | 1921 | * that has tasks along with an empty 'mems'. But if we did see such |
1922 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | 1922 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. |
1923 | */ | 1923 | */ |
1924 | static void scan_for_empty_cpusets(const struct cpuset *root) | 1924 | static void scan_for_empty_cpusets(struct cpuset *root) |
1925 | { | 1925 | { |
1926 | LIST_HEAD(queue); | 1926 | LIST_HEAD(queue); |
1927 | struct cpuset *cp; /* scans cpusets being updated */ | 1927 | struct cpuset *cp; /* scans cpusets being updated */ |
diff --git a/kernel/sched.c b/kernel/sched.c index ad1962dc0aa2..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; | 204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; |
205 | } | 205 | } |
206 | 206 | ||
207 | static inline int rt_bandwidth_enabled(void) | ||
208 | { | ||
209 | return sysctl_sched_rt_runtime >= 0; | ||
210 | } | ||
211 | |||
207 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 212 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) |
208 | { | 213 | { |
209 | ktime_t now; | 214 | ktime_t now; |
210 | 215 | ||
211 | if (rt_b->rt_runtime == RUNTIME_INF) | 216 | if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) |
212 | return; | 217 | return; |
213 | 218 | ||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | 219 | if (hrtimer_active(&rt_b->rt_period_timer)) |
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | |||
298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 303 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 304 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
300 | #endif /* CONFIG_RT_GROUP_SCHED */ | 305 | #endif /* CONFIG_RT_GROUP_SCHED */ |
301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 306 | #else /* !CONFIG_USER_SCHED */ |
302 | #define root_task_group init_task_group | 307 | #define root_task_group init_task_group |
303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 308 | #endif /* CONFIG_USER_SCHED */ |
304 | 309 | ||
305 | /* task_group_lock serializes add/remove of task groups and also changes to | 310 | /* task_group_lock serializes add/remove of task groups and also changes to |
306 | * a task group's cpu shares. | 311 | * a task group's cpu shares. |
@@ -604,9 +609,9 @@ struct rq { | |||
604 | 609 | ||
605 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 610 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
606 | 611 | ||
607 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 612 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) |
608 | { | 613 | { |
609 | rq->curr->sched_class->check_preempt_curr(rq, p); | 614 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); |
610 | } | 615 | } |
611 | 616 | ||
612 | static inline int cpu_of(struct rq *rq) | 617 | static inline int cpu_of(struct rq *rq) |
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) | |||
1102 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); | 1107 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); |
1103 | } | 1108 | } |
1104 | 1109 | ||
1105 | static void init_hrtick(void) | 1110 | static inline void init_hrtick(void) |
1106 | { | 1111 | { |
1107 | } | 1112 | } |
1108 | #endif /* CONFIG_SMP */ | 1113 | #endif /* CONFIG_SMP */ |
@@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq) | |||
1121 | rq->hrtick_timer.function = hrtick; | 1126 | rq->hrtick_timer.function = hrtick; |
1122 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; | 1127 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; |
1123 | } | 1128 | } |
1124 | #else | 1129 | #else /* CONFIG_SCHED_HRTICK */ |
1125 | static inline void hrtick_clear(struct rq *rq) | 1130 | static inline void hrtick_clear(struct rq *rq) |
1126 | { | 1131 | { |
1127 | } | 1132 | } |
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) | |||
1133 | static inline void init_hrtick(void) | 1138 | static inline void init_hrtick(void) |
1134 | { | 1139 | { |
1135 | } | 1140 | } |
1136 | #endif | 1141 | #endif /* CONFIG_SCHED_HRTICK */ |
1137 | 1142 | ||
1138 | /* | 1143 | /* |
1139 | * resched_task - mark a task 'to be rescheduled now'. | 1144 | * resched_task - mark a task 'to be rescheduled now'. |
@@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1380 | update_load_sub(&rq->load, load); | 1385 | update_load_sub(&rq->load, load); |
1381 | } | 1386 | } |
1382 | 1387 | ||
1383 | #ifdef CONFIG_SMP | 1388 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) |
1384 | static unsigned long source_load(int cpu, int type); | 1389 | typedef int (*tg_visitor)(struct task_group *, void *); |
1385 | static unsigned long target_load(int cpu, int type); | ||
1386 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1387 | |||
1388 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1389 | { | ||
1390 | struct rq *rq = cpu_rq(cpu); | ||
1391 | |||
1392 | if (rq->nr_running) | ||
1393 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1394 | |||
1395 | return rq->avg_load_per_task; | ||
1396 | } | ||
1397 | |||
1398 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1399 | |||
1400 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
1401 | 1390 | ||
1402 | /* | 1391 | /* |
1403 | * Iterate the full tree, calling @down when first entering a node and @up when | 1392 | * Iterate the full tree, calling @down when first entering a node and @up when |
1404 | * leaving it for the final time. | 1393 | * leaving it for the final time. |
1405 | */ | 1394 | */ |
1406 | static void | 1395 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) |
1407 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
1408 | { | 1396 | { |
1409 | struct task_group *parent, *child; | 1397 | struct task_group *parent, *child; |
1398 | int ret; | ||
1410 | 1399 | ||
1411 | rcu_read_lock(); | 1400 | rcu_read_lock(); |
1412 | parent = &root_task_group; | 1401 | parent = &root_task_group; |
1413 | down: | 1402 | down: |
1414 | (*down)(parent, cpu, sd); | 1403 | ret = (*down)(parent, data); |
1404 | if (ret) | ||
1405 | goto out_unlock; | ||
1415 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1406 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1416 | parent = child; | 1407 | parent = child; |
1417 | goto down; | 1408 | goto down; |
@@ -1419,15 +1410,43 @@ down: | |||
1419 | up: | 1410 | up: |
1420 | continue; | 1411 | continue; |
1421 | } | 1412 | } |
1422 | (*up)(parent, cpu, sd); | 1413 | ret = (*up)(parent, data); |
1414 | if (ret) | ||
1415 | goto out_unlock; | ||
1423 | 1416 | ||
1424 | child = parent; | 1417 | child = parent; |
1425 | parent = parent->parent; | 1418 | parent = parent->parent; |
1426 | if (parent) | 1419 | if (parent) |
1427 | goto up; | 1420 | goto up; |
1421 | out_unlock: | ||
1428 | rcu_read_unlock(); | 1422 | rcu_read_unlock(); |
1423 | |||
1424 | return ret; | ||
1429 | } | 1425 | } |
1430 | 1426 | ||
1427 | static int tg_nop(struct task_group *tg, void *data) | ||
1428 | { | ||
1429 | return 0; | ||
1430 | } | ||
1431 | #endif | ||
1432 | |||
1433 | #ifdef CONFIG_SMP | ||
1434 | static unsigned long source_load(int cpu, int type); | ||
1435 | static unsigned long target_load(int cpu, int type); | ||
1436 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1437 | |||
1438 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1439 | { | ||
1440 | struct rq *rq = cpu_rq(cpu); | ||
1441 | |||
1442 | if (rq->nr_running) | ||
1443 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1444 | |||
1445 | return rq->avg_load_per_task; | ||
1446 | } | ||
1447 | |||
1448 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1449 | |||
1431 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1450 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1432 | 1451 | ||
1433 | /* | 1452 | /* |
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1486 | * This needs to be done in a bottom-up fashion because the rq weight of a | 1505 | * This needs to be done in a bottom-up fashion because the rq weight of a |
1487 | * parent group depends on the shares of its child groups. | 1506 | * parent group depends on the shares of its child groups. |
1488 | */ | 1507 | */ |
1489 | static void | 1508 | static int tg_shares_up(struct task_group *tg, void *data) |
1490 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1491 | { | 1509 | { |
1492 | unsigned long rq_weight = 0; | 1510 | unsigned long rq_weight = 0; |
1493 | unsigned long shares = 0; | 1511 | unsigned long shares = 0; |
1512 | struct sched_domain *sd = data; | ||
1494 | int i; | 1513 | int i; |
1495 | 1514 | ||
1496 | for_each_cpu_mask(i, sd->span) { | 1515 | for_each_cpu_mask(i, sd->span) { |
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
1515 | __update_group_shares_cpu(tg, i, shares, rq_weight); | 1534 | __update_group_shares_cpu(tg, i, shares, rq_weight); |
1516 | spin_unlock_irqrestore(&rq->lock, flags); | 1535 | spin_unlock_irqrestore(&rq->lock, flags); |
1517 | } | 1536 | } |
1537 | |||
1538 | return 0; | ||
1518 | } | 1539 | } |
1519 | 1540 | ||
1520 | /* | 1541 | /* |
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
1522 | * This needs to be done in a top-down fashion because the load of a child | 1543 | * This needs to be done in a top-down fashion because the load of a child |
1523 | * group is a fraction of its parents load. | 1544 | * group is a fraction of its parents load. |
1524 | */ | 1545 | */ |
1525 | static void | 1546 | static int tg_load_down(struct task_group *tg, void *data) |
1526 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1527 | { | 1547 | { |
1528 | unsigned long load; | 1548 | unsigned long load; |
1549 | long cpu = (long)data; | ||
1529 | 1550 | ||
1530 | if (!tg->parent) { | 1551 | if (!tg->parent) { |
1531 | load = cpu_rq(cpu)->load.weight; | 1552 | load = cpu_rq(cpu)->load.weight; |
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
1536 | } | 1557 | } |
1537 | 1558 | ||
1538 | tg->cfs_rq[cpu]->h_load = load; | 1559 | tg->cfs_rq[cpu]->h_load = load; |
1539 | } | ||
1540 | 1560 | ||
1541 | static void | 1561 | return 0; |
1542 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1543 | { | ||
1544 | } | 1562 | } |
1545 | 1563 | ||
1546 | static void update_shares(struct sched_domain *sd) | 1564 | static void update_shares(struct sched_domain *sd) |
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) | |||
1550 | 1568 | ||
1551 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1569 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1552 | sd->last_update = now; | 1570 | sd->last_update = now; |
1553 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | 1571 | walk_tg_tree(tg_nop, tg_shares_up, sd); |
1554 | } | 1572 | } |
1555 | } | 1573 | } |
1556 | 1574 | ||
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1561 | spin_lock(&rq->lock); | 1579 | spin_lock(&rq->lock); |
1562 | } | 1580 | } |
1563 | 1581 | ||
1564 | static void update_h_load(int cpu) | 1582 | static void update_h_load(long cpu) |
1565 | { | 1583 | { |
1566 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | 1584 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1567 | } | 1585 | } |
1568 | 1586 | ||
1569 | #else | 1587 | #else |
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
1921 | running = task_running(rq, p); | 1939 | running = task_running(rq, p); |
1922 | on_rq = p->se.on_rq; | 1940 | on_rq = p->se.on_rq; |
1923 | ncsw = 0; | 1941 | ncsw = 0; |
1924 | if (!match_state || p->state == match_state) { | 1942 | if (!match_state || p->state == match_state) |
1925 | ncsw = p->nivcsw + p->nvcsw; | 1943 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
1926 | if (unlikely(!ncsw)) | ||
1927 | ncsw = 1; | ||
1928 | } | ||
1929 | task_rq_unlock(rq, &flags); | 1944 | task_rq_unlock(rq, &flags); |
1930 | 1945 | ||
1931 | /* | 1946 | /* |
@@ -2285,7 +2300,7 @@ out_running: | |||
2285 | trace_mark(kernel_sched_wakeup, | 2300 | trace_mark(kernel_sched_wakeup, |
2286 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2301 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
2287 | p->pid, p->state, rq, p, rq->curr); | 2302 | p->pid, p->state, rq, p, rq->curr); |
2288 | check_preempt_curr(rq, p); | 2303 | check_preempt_curr(rq, p, sync); |
2289 | 2304 | ||
2290 | p->state = TASK_RUNNING; | 2305 | p->state = TASK_RUNNING; |
2291 | #ifdef CONFIG_SMP | 2306 | #ifdef CONFIG_SMP |
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2420 | trace_mark(kernel_sched_wakeup_new, | 2435 | trace_mark(kernel_sched_wakeup_new, |
2421 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2436 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
2422 | p->pid, p->state, rq, p, rq->curr); | 2437 | p->pid, p->state, rq, p, rq->curr); |
2423 | check_preempt_curr(rq, p); | 2438 | check_preempt_curr(rq, p, 0); |
2424 | #ifdef CONFIG_SMP | 2439 | #ifdef CONFIG_SMP |
2425 | if (p->sched_class->task_wake_up) | 2440 | if (p->sched_class->task_wake_up) |
2426 | p->sched_class->task_wake_up(rq, p); | 2441 | p->sched_class->task_wake_up(rq, p); |
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
2880 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2895 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2881 | * to be always true for them. | 2896 | * to be always true for them. |
2882 | */ | 2897 | */ |
2883 | check_preempt_curr(this_rq, p); | 2898 | check_preempt_curr(this_rq, p, 0); |
2884 | } | 2899 | } |
2885 | 2900 | ||
2886 | /* | 2901 | /* |
@@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | |||
4627 | } | 4642 | } |
4628 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4643 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
4629 | 4644 | ||
4645 | /** | ||
4646 | * complete: - signals a single thread waiting on this completion | ||
4647 | * @x: holds the state of this particular completion | ||
4648 | * | ||
4649 | * This will wake up a single thread waiting on this completion. Threads will be | ||
4650 | * awakened in the same order in which they were queued. | ||
4651 | * | ||
4652 | * See also complete_all(), wait_for_completion() and related routines. | ||
4653 | */ | ||
4630 | void complete(struct completion *x) | 4654 | void complete(struct completion *x) |
4631 | { | 4655 | { |
4632 | unsigned long flags; | 4656 | unsigned long flags; |
@@ -4638,6 +4662,12 @@ void complete(struct completion *x) | |||
4638 | } | 4662 | } |
4639 | EXPORT_SYMBOL(complete); | 4663 | EXPORT_SYMBOL(complete); |
4640 | 4664 | ||
4665 | /** | ||
4666 | * complete_all: - signals all threads waiting on this completion | ||
4667 | * @x: holds the state of this particular completion | ||
4668 | * | ||
4669 | * This will wake up all threads waiting on this particular completion event. | ||
4670 | */ | ||
4641 | void complete_all(struct completion *x) | 4671 | void complete_all(struct completion *x) |
4642 | { | 4672 | { |
4643 | unsigned long flags; | 4673 | unsigned long flags; |
@@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
4658 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4688 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
4659 | __add_wait_queue_tail(&x->wait, &wait); | 4689 | __add_wait_queue_tail(&x->wait, &wait); |
4660 | do { | 4690 | do { |
4661 | if ((state == TASK_INTERRUPTIBLE && | 4691 | if (signal_pending_state(state, current)) { |
4662 | signal_pending(current)) || | ||
4663 | (state == TASK_KILLABLE && | ||
4664 | fatal_signal_pending(current))) { | ||
4665 | timeout = -ERESTARTSYS; | 4692 | timeout = -ERESTARTSYS; |
4666 | break; | 4693 | break; |
4667 | } | 4694 | } |
@@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) | |||
4689 | return timeout; | 4716 | return timeout; |
4690 | } | 4717 | } |
4691 | 4718 | ||
4719 | /** | ||
4720 | * wait_for_completion: - waits for completion of a task | ||
4721 | * @x: holds the state of this particular completion | ||
4722 | * | ||
4723 | * This waits to be signaled for completion of a specific task. It is NOT | ||
4724 | * interruptible and there is no timeout. | ||
4725 | * | ||
4726 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
4727 | * and interrupt capability. Also see complete(). | ||
4728 | */ | ||
4692 | void __sched wait_for_completion(struct completion *x) | 4729 | void __sched wait_for_completion(struct completion *x) |
4693 | { | 4730 | { |
4694 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4731 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
4695 | } | 4732 | } |
4696 | EXPORT_SYMBOL(wait_for_completion); | 4733 | EXPORT_SYMBOL(wait_for_completion); |
4697 | 4734 | ||
4735 | /** | ||
4736 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
4737 | * @x: holds the state of this particular completion | ||
4738 | * @timeout: timeout value in jiffies | ||
4739 | * | ||
4740 | * This waits for either a completion of a specific task to be signaled or for a | ||
4741 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
4742 | * interruptible. | ||
4743 | */ | ||
4698 | unsigned long __sched | 4744 | unsigned long __sched |
4699 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4745 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
4700 | { | 4746 | { |
@@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
4702 | } | 4748 | } |
4703 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4749 | EXPORT_SYMBOL(wait_for_completion_timeout); |
4704 | 4750 | ||
4751 | /** | ||
4752 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
4753 | * @x: holds the state of this particular completion | ||
4754 | * | ||
4755 | * This waits for completion of a specific task to be signaled. It is | ||
4756 | * interruptible. | ||
4757 | */ | ||
4705 | int __sched wait_for_completion_interruptible(struct completion *x) | 4758 | int __sched wait_for_completion_interruptible(struct completion *x) |
4706 | { | 4759 | { |
4707 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4760 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
@@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) | |||
4711 | } | 4764 | } |
4712 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4765 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
4713 | 4766 | ||
4767 | /** | ||
4768 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
4769 | * @x: holds the state of this particular completion | ||
4770 | * @timeout: timeout value in jiffies | ||
4771 | * | ||
4772 | * This waits for either a completion of a specific task to be signaled or for a | ||
4773 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
4774 | */ | ||
4714 | unsigned long __sched | 4775 | unsigned long __sched |
4715 | wait_for_completion_interruptible_timeout(struct completion *x, | 4776 | wait_for_completion_interruptible_timeout(struct completion *x, |
4716 | unsigned long timeout) | 4777 | unsigned long timeout) |
@@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, | |||
4719 | } | 4780 | } |
4720 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4781 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
4721 | 4782 | ||
4783 | /** | ||
4784 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
4785 | * @x: holds the state of this particular completion | ||
4786 | * | ||
4787 | * This waits to be signaled for completion of a specific task. It can be | ||
4788 | * interrupted by a kill signal. | ||
4789 | */ | ||
4722 | int __sched wait_for_completion_killable(struct completion *x) | 4790 | int __sched wait_for_completion_killable(struct completion *x) |
4723 | { | 4791 | { |
4724 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | 4792 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
@@ -5121,7 +5189,8 @@ recheck: | |||
5121 | * Do not allow realtime tasks into groups that have no runtime | 5189 | * Do not allow realtime tasks into groups that have no runtime |
5122 | * assigned. | 5190 | * assigned. |
5123 | */ | 5191 | */ |
5124 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5192 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5193 | task_group(p)->rt_bandwidth.rt_runtime == 0) | ||
5125 | return -EPERM; | 5194 | return -EPERM; |
5126 | #endif | 5195 | #endif |
5127 | 5196 | ||
@@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5957 | set_task_cpu(p, dest_cpu); | 6026 | set_task_cpu(p, dest_cpu); |
5958 | if (on_rq) { | 6027 | if (on_rq) { |
5959 | activate_task(rq_dest, p, 0); | 6028 | activate_task(rq_dest, p, 0); |
5960 | check_preempt_curr(rq_dest, p); | 6029 | check_preempt_curr(rq_dest, p, 0); |
5961 | } | 6030 | } |
5962 | done: | 6031 | done: |
5963 | ret = 1; | 6032 | ret = 1; |
@@ -6282,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, | |||
6282 | static struct ctl_table * | 6351 | static struct ctl_table * |
6283 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 6352 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
6284 | { | 6353 | { |
6285 | struct ctl_table *table = sd_alloc_ctl_entry(12); | 6354 | struct ctl_table *table = sd_alloc_ctl_entry(13); |
6286 | 6355 | ||
6287 | if (table == NULL) | 6356 | if (table == NULL) |
6288 | return NULL; | 6357 | return NULL; |
@@ -6310,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
6310 | sizeof(int), 0644, proc_dointvec_minmax); | 6379 | sizeof(int), 0644, proc_dointvec_minmax); |
6311 | set_table_entry(&table[10], "flags", &sd->flags, | 6380 | set_table_entry(&table[10], "flags", &sd->flags, |
6312 | sizeof(int), 0644, proc_dointvec_minmax); | 6381 | sizeof(int), 0644, proc_dointvec_minmax); |
6313 | /* &table[11] is terminator */ | 6382 | set_table_entry(&table[11], "name", sd->name, |
6383 | CORENAME_MAX_SIZE, 0444, proc_dostring); | ||
6384 | /* &table[12] is terminator */ | ||
6314 | 6385 | ||
6315 | return table; | 6386 | return table; |
6316 | } | 6387 | } |
@@ -7194,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7194 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 7265 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
7195 | */ | 7266 | */ |
7196 | 7267 | ||
7268 | #ifdef CONFIG_SCHED_DEBUG | ||
7269 | # define SD_INIT_NAME(sd, type) sd->name = #type | ||
7270 | #else | ||
7271 | # define SD_INIT_NAME(sd, type) do { } while (0) | ||
7272 | #endif | ||
7273 | |||
7197 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7274 | #define SD_INIT(sd, type) sd_init_##type(sd) |
7275 | |||
7198 | #define SD_INIT_FUNC(type) \ | 7276 | #define SD_INIT_FUNC(type) \ |
7199 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7277 | static noinline void sd_init_##type(struct sched_domain *sd) \ |
7200 | { \ | 7278 | { \ |
7201 | memset(sd, 0, sizeof(*sd)); \ | 7279 | memset(sd, 0, sizeof(*sd)); \ |
7202 | *sd = SD_##type##_INIT; \ | 7280 | *sd = SD_##type##_INIT; \ |
7203 | sd->level = SD_LV_##type; \ | 7281 | sd->level = SD_LV_##type; \ |
7282 | SD_INIT_NAME(sd, type); \ | ||
7204 | } | 7283 | } |
7205 | 7284 | ||
7206 | SD_INIT_FUNC(CPU) | 7285 | SD_INIT_FUNC(CPU) |
@@ -8242,20 +8321,25 @@ void __might_sleep(char *file, int line) | |||
8242 | #ifdef in_atomic | 8321 | #ifdef in_atomic |
8243 | static unsigned long prev_jiffy; /* ratelimiting */ | 8322 | static unsigned long prev_jiffy; /* ratelimiting */ |
8244 | 8323 | ||
8245 | if ((in_atomic() || irqs_disabled()) && | 8324 | if ((!in_atomic() && !irqs_disabled()) || |
8246 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 8325 | system_state != SYSTEM_RUNNING || oops_in_progress) |
8247 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8326 | return; |
8248 | return; | 8327 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
8249 | prev_jiffy = jiffies; | 8328 | return; |
8250 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 8329 | prev_jiffy = jiffies; |
8251 | " context at %s:%d\n", file, line); | 8330 | |
8252 | printk("in_atomic():%d, irqs_disabled():%d\n", | 8331 | printk(KERN_ERR |
8253 | in_atomic(), irqs_disabled()); | 8332 | "BUG: sleeping function called from invalid context at %s:%d\n", |
8254 | debug_show_held_locks(current); | 8333 | file, line); |
8255 | if (irqs_disabled()) | 8334 | printk(KERN_ERR |
8256 | print_irqtrace_events(current); | 8335 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
8257 | dump_stack(); | 8336 | in_atomic(), irqs_disabled(), |
8258 | } | 8337 | current->pid, current->comm); |
8338 | |||
8339 | debug_show_held_locks(current); | ||
8340 | if (irqs_disabled()) | ||
8341 | print_irqtrace_events(current); | ||
8342 | dump_stack(); | ||
8259 | #endif | 8343 | #endif |
8260 | } | 8344 | } |
8261 | EXPORT_SYMBOL(__might_sleep); | 8345 | EXPORT_SYMBOL(__might_sleep); |
@@ -8753,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); | |||
8753 | static unsigned long to_ratio(u64 period, u64 runtime) | 8837 | static unsigned long to_ratio(u64 period, u64 runtime) |
8754 | { | 8838 | { |
8755 | if (runtime == RUNTIME_INF) | 8839 | if (runtime == RUNTIME_INF) |
8756 | return 1ULL << 16; | 8840 | return 1ULL << 20; |
8757 | 8841 | ||
8758 | return div64_u64(runtime << 16, period); | 8842 | return div64_u64(runtime << 20, period); |
8759 | } | 8843 | } |
8760 | 8844 | ||
8761 | #ifdef CONFIG_CGROUP_SCHED | 8845 | /* Must be called with tasklist_lock held */ |
8762 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8846 | static inline int tg_has_rt_tasks(struct task_group *tg) |
8763 | { | 8847 | { |
8764 | struct task_group *tgi, *parent = tg->parent; | 8848 | struct task_struct *g, *p; |
8765 | unsigned long total = 0; | ||
8766 | 8849 | ||
8767 | if (!parent) { | 8850 | do_each_thread(g, p) { |
8768 | if (global_rt_period() < period) | 8851 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) |
8769 | return 0; | 8852 | return 1; |
8853 | } while_each_thread(g, p); | ||
8770 | 8854 | ||
8771 | return to_ratio(period, runtime) < | 8855 | return 0; |
8772 | to_ratio(global_rt_period(), global_rt_runtime()); | 8856 | } |
8773 | } | ||
8774 | 8857 | ||
8775 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | 8858 | struct rt_schedulable_data { |
8776 | return 0; | 8859 | struct task_group *tg; |
8860 | u64 rt_period; | ||
8861 | u64 rt_runtime; | ||
8862 | }; | ||
8777 | 8863 | ||
8778 | rcu_read_lock(); | 8864 | static int tg_schedulable(struct task_group *tg, void *data) |
8779 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | 8865 | { |
8780 | if (tgi == tg) | 8866 | struct rt_schedulable_data *d = data; |
8781 | continue; | 8867 | struct task_group *child; |
8868 | unsigned long total, sum = 0; | ||
8869 | u64 period, runtime; | ||
8782 | 8870 | ||
8783 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8871 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); |
8784 | tgi->rt_bandwidth.rt_runtime); | 8872 | runtime = tg->rt_bandwidth.rt_runtime; |
8873 | |||
8874 | if (tg == d->tg) { | ||
8875 | period = d->rt_period; | ||
8876 | runtime = d->rt_runtime; | ||
8785 | } | 8877 | } |
8786 | rcu_read_unlock(); | ||
8787 | 8878 | ||
8788 | return total + to_ratio(period, runtime) <= | 8879 | /* |
8789 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8880 | * Cannot have more runtime than the period. |
8790 | parent->rt_bandwidth.rt_runtime); | 8881 | */ |
8791 | } | 8882 | if (runtime > period && runtime != RUNTIME_INF) |
8792 | #elif defined CONFIG_USER_SCHED | 8883 | return -EINVAL; |
8793 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
8794 | { | ||
8795 | struct task_group *tgi; | ||
8796 | unsigned long total = 0; | ||
8797 | unsigned long global_ratio = | ||
8798 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
8799 | 8884 | ||
8800 | rcu_read_lock(); | 8885 | /* |
8801 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8886 | * Ensure we don't starve existing RT tasks. |
8802 | if (tgi == tg) | 8887 | */ |
8803 | continue; | 8888 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) |
8889 | return -EBUSY; | ||
8890 | |||
8891 | total = to_ratio(period, runtime); | ||
8892 | |||
8893 | /* | ||
8894 | * Nobody can have more than the global setting allows. | ||
8895 | */ | ||
8896 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
8897 | return -EINVAL; | ||
8898 | |||
8899 | /* | ||
8900 | * The sum of our children's runtime should not exceed our own. | ||
8901 | */ | ||
8902 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
8903 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
8904 | runtime = child->rt_bandwidth.rt_runtime; | ||
8905 | |||
8906 | if (child == d->tg) { | ||
8907 | period = d->rt_period; | ||
8908 | runtime = d->rt_runtime; | ||
8909 | } | ||
8804 | 8910 | ||
8805 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8911 | sum += to_ratio(period, runtime); |
8806 | tgi->rt_bandwidth.rt_runtime); | ||
8807 | } | 8912 | } |
8808 | rcu_read_unlock(); | ||
8809 | 8913 | ||
8810 | return total + to_ratio(period, runtime) < global_ratio; | 8914 | if (sum > total) |
8915 | return -EINVAL; | ||
8916 | |||
8917 | return 0; | ||
8811 | } | 8918 | } |
8812 | #endif | ||
8813 | 8919 | ||
8814 | /* Must be called with tasklist_lock held */ | 8920 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8815 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
8816 | { | 8921 | { |
8817 | struct task_struct *g, *p; | 8922 | struct rt_schedulable_data data = { |
8818 | do_each_thread(g, p) { | 8923 | .tg = tg, |
8819 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 8924 | .rt_period = period, |
8820 | return 1; | 8925 | .rt_runtime = runtime, |
8821 | } while_each_thread(g, p); | 8926 | }; |
8822 | return 0; | 8927 | |
8928 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | ||
8823 | } | 8929 | } |
8824 | 8930 | ||
8825 | static int tg_set_bandwidth(struct task_group *tg, | 8931 | static int tg_set_bandwidth(struct task_group *tg, |
@@ -8829,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8829 | 8935 | ||
8830 | mutex_lock(&rt_constraints_mutex); | 8936 | mutex_lock(&rt_constraints_mutex); |
8831 | read_lock(&tasklist_lock); | 8937 | read_lock(&tasklist_lock); |
8832 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { | 8938 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
8833 | err = -EBUSY; | 8939 | if (err) |
8834 | goto unlock; | 8940 | goto unlock; |
8835 | } | ||
8836 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { | ||
8837 | err = -EINVAL; | ||
8838 | goto unlock; | ||
8839 | } | ||
8840 | 8941 | ||
8841 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8942 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8842 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8943 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
@@ -8905,19 +9006,25 @@ long sched_group_rt_period(struct task_group *tg) | |||
8905 | 9006 | ||
8906 | static int sched_rt_global_constraints(void) | 9007 | static int sched_rt_global_constraints(void) |
8907 | { | 9008 | { |
8908 | struct task_group *tg = &root_task_group; | 9009 | u64 runtime, period; |
8909 | u64 rt_runtime, rt_period; | ||
8910 | int ret = 0; | 9010 | int ret = 0; |
8911 | 9011 | ||
8912 | if (sysctl_sched_rt_period <= 0) | 9012 | if (sysctl_sched_rt_period <= 0) |
8913 | return -EINVAL; | 9013 | return -EINVAL; |
8914 | 9014 | ||
8915 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | 9015 | runtime = global_rt_runtime(); |
8916 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 9016 | period = global_rt_period(); |
9017 | |||
9018 | /* | ||
9019 | * Sanity check on the sysctl variables. | ||
9020 | */ | ||
9021 | if (runtime > period && runtime != RUNTIME_INF) | ||
9022 | return -EINVAL; | ||
8917 | 9023 | ||
8918 | mutex_lock(&rt_constraints_mutex); | 9024 | mutex_lock(&rt_constraints_mutex); |
8919 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) | 9025 | read_lock(&tasklist_lock); |
8920 | ret = -EINVAL; | 9026 | ret = __rt_schedulable(NULL, 0, 0); |
9027 | read_unlock(&tasklist_lock); | ||
8921 | mutex_unlock(&rt_constraints_mutex); | 9028 | mutex_unlock(&rt_constraints_mutex); |
8922 | 9029 | ||
8923 | return ret; | 9030 | return ret; |
@@ -8991,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8991 | 9098 | ||
8992 | if (!cgrp->parent) { | 9099 | if (!cgrp->parent) { |
8993 | /* This is early initialization for the top cgroup */ | 9100 | /* This is early initialization for the top cgroup */ |
8994 | init_task_group.css.cgroup = cgrp; | ||
8995 | return &init_task_group.css; | 9101 | return &init_task_group.css; |
8996 | } | 9102 | } |
8997 | 9103 | ||
@@ -9000,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9000 | if (IS_ERR(tg)) | 9106 | if (IS_ERR(tg)) |
9001 | return ERR_PTR(-ENOMEM); | 9107 | return ERR_PTR(-ENOMEM); |
9002 | 9108 | ||
9003 | /* Bind the cgroup to task_group object we just created */ | ||
9004 | tg->css.cgroup = cgrp; | ||
9005 | |||
9006 | return &tg->css; | 9109 | return &tg->css; |
9007 | } | 9110 | } |
9008 | 9111 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..18fd17172eb6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
409 | } | 409 | } |
410 | 410 | ||
411 | /* | 411 | /* |
412 | * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in | ||
413 | * that it favours >=0 over <0. | ||
414 | * | ||
415 | * -20 | | ||
416 | * | | ||
417 | * 0 --------+------- | ||
418 | * .' | ||
419 | * 19 .' | ||
420 | * | ||
421 | */ | ||
422 | static unsigned long | ||
423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
424 | { | ||
425 | struct load_weight lw = { | ||
426 | .weight = NICE_0_LOAD, | ||
427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
428 | }; | ||
429 | |||
430 | for_each_sched_entity(se) { | ||
431 | struct load_weight *se_lw = &se->load; | ||
432 | unsigned long rw = cfs_rq_of(se)->load.weight; | ||
433 | |||
434 | #ifdef CONFIG_FAIR_SCHED_GROUP | ||
435 | struct cfs_rq *cfs_rq = se->my_q; | ||
436 | struct task_group *tg = NULL | ||
437 | |||
438 | if (cfs_rq) | ||
439 | tg = cfs_rq->tg; | ||
440 | |||
441 | if (tg && tg->shares < NICE_0_LOAD) { | ||
442 | /* | ||
443 | * scale shares to what it would have been had | ||
444 | * tg->weight been NICE_0_LOAD: | ||
445 | * | ||
446 | * weight = 1024 * shares / tg->weight | ||
447 | */ | ||
448 | lw.weight *= se->load.weight; | ||
449 | lw.weight /= tg->shares; | ||
450 | |||
451 | lw.inv_weight = 0; | ||
452 | |||
453 | se_lw = &lw; | ||
454 | rw += lw.weight - se->load.weight; | ||
455 | } else | ||
456 | #endif | ||
457 | |||
458 | if (se->load.weight < NICE_0_LOAD) { | ||
459 | se_lw = &lw; | ||
460 | rw += NICE_0_LOAD - se->load.weight; | ||
461 | } | ||
462 | |||
463 | delta = calc_delta_mine(delta, rw, se_lw); | ||
464 | } | ||
465 | |||
466 | return delta; | ||
467 | } | ||
468 | |||
469 | /* | ||
470 | * Update the current task's runtime statistics. Skip current tasks that | 412 | * Update the current task's runtime statistics. Skip current tasks that |
471 | * are not in our scheduling class. | 413 | * are not in our scheduling class. |
472 | */ | 414 | */ |
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
586 | update_load_add(&cfs_rq->load, se->load.weight); | 528 | update_load_add(&cfs_rq->load, se->load.weight); |
587 | if (!parent_entity(se)) | 529 | if (!parent_entity(se)) |
588 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | 530 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); |
589 | if (entity_is_task(se)) | 531 | if (entity_is_task(se)) { |
590 | add_cfs_task_weight(cfs_rq, se->load.weight); | 532 | add_cfs_task_weight(cfs_rq, se->load.weight); |
533 | list_add(&se->group_node, &cfs_rq->tasks); | ||
534 | } | ||
591 | cfs_rq->nr_running++; | 535 | cfs_rq->nr_running++; |
592 | se->on_rq = 1; | 536 | se->on_rq = 1; |
593 | list_add(&se->group_node, &cfs_rq->tasks); | ||
594 | } | 537 | } |
595 | 538 | ||
596 | static void | 539 | static void |
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
599 | update_load_sub(&cfs_rq->load, se->load.weight); | 542 | update_load_sub(&cfs_rq->load, se->load.weight); |
600 | if (!parent_entity(se)) | 543 | if (!parent_entity(se)) |
601 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | 544 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); |
602 | if (entity_is_task(se)) | 545 | if (entity_is_task(se)) { |
603 | add_cfs_task_weight(cfs_rq, -se->load.weight); | 546 | add_cfs_task_weight(cfs_rq, -se->load.weight); |
547 | list_del_init(&se->group_node); | ||
548 | } | ||
604 | cfs_rq->nr_running--; | 549 | cfs_rq->nr_running--; |
605 | se->on_rq = 0; | 550 | se->on_rq = 0; |
606 | list_del_init(&se->group_node); | ||
607 | } | 551 | } |
608 | 552 | ||
609 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 553 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu, | |||
1085 | long wl, long wg) | 1029 | long wl, long wg) |
1086 | { | 1030 | { |
1087 | struct sched_entity *se = tg->se[cpu]; | 1031 | struct sched_entity *se = tg->se[cpu]; |
1088 | long more_w; | ||
1089 | 1032 | ||
1090 | if (!tg->parent) | 1033 | if (!tg->parent) |
1091 | return wl; | 1034 | return wl; |
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu, | |||
1097 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | 1040 | if (!wl && sched_feat(ASYM_EFF_LOAD)) |
1098 | return wl; | 1041 | return wl; |
1099 | 1042 | ||
1100 | /* | ||
1101 | * Instead of using this increment, also add the difference | ||
1102 | * between when the shares were last updated and now. | ||
1103 | */ | ||
1104 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1105 | wl += more_w; | ||
1106 | wg += more_w; | ||
1107 | |||
1108 | for_each_sched_entity(se) { | 1043 | for_each_sched_entity(se) { |
1109 | #define D(n) (likely(n) ? (n) : 1) | ||
1110 | |||
1111 | long S, rw, s, a, b; | 1044 | long S, rw, s, a, b; |
1045 | long more_w; | ||
1046 | |||
1047 | /* | ||
1048 | * Instead of using this increment, also add the difference | ||
1049 | * between when the shares were last updated and now. | ||
1050 | */ | ||
1051 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1052 | wl += more_w; | ||
1053 | wg += more_w; | ||
1112 | 1054 | ||
1113 | S = se->my_q->tg->shares; | 1055 | S = se->my_q->tg->shares; |
1114 | s = se->my_q->shares; | 1056 | s = se->my_q->shares; |
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu, | |||
1117 | a = S*(rw + wl); | 1059 | a = S*(rw + wl); |
1118 | b = S*rw + s*wg; | 1060 | b = S*rw + s*wg; |
1119 | 1061 | ||
1120 | wl = s*(a-b)/D(b); | 1062 | wl = s*(a-b); |
1063 | |||
1064 | if (likely(b)) | ||
1065 | wl /= b; | ||
1066 | |||
1121 | /* | 1067 | /* |
1122 | * Assume the group is already running and will | 1068 | * Assume the group is already running and will |
1123 | * thus already be accounted for in the weight. | 1069 | * thus already be accounted for in the weight. |
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu, | |||
1126 | * alter the group weight. | 1072 | * alter the group weight. |
1127 | */ | 1073 | */ |
1128 | wg = 0; | 1074 | wg = 0; |
1129 | #undef D | ||
1130 | } | 1075 | } |
1131 | 1076 | ||
1132 | return wl; | 1077 | return wl; |
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1143 | #endif | 1088 | #endif |
1144 | 1089 | ||
1145 | static int | 1090 | static int |
1146 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | 1091 | wake_affine(struct sched_domain *this_sd, struct rq *this_rq, |
1147 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | 1092 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, |
1148 | int idx, unsigned long load, unsigned long this_load, | 1093 | int idx, unsigned long load, unsigned long this_load, |
1149 | unsigned int imbalance) | 1094 | unsigned int imbalance) |
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
1158 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) | 1103 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) |
1159 | return 0; | 1104 | return 0; |
1160 | 1105 | ||
1106 | if (!sync && sched_feat(SYNC_WAKEUPS) && | ||
1107 | curr->se.avg_overlap < sysctl_sched_migration_cost && | ||
1108 | p->se.avg_overlap < sysctl_sched_migration_cost) | ||
1109 | sync = 1; | ||
1110 | |||
1161 | /* | 1111 | /* |
1162 | * If sync wakeup then subtract the (maximum possible) | 1112 | * If sync wakeup then subtract the (maximum possible) |
1163 | * effect of the currently running task from the load | 1113 | * effect of the currently running task from the load |
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
1182 | * a reasonable amount of time then attract this newly | 1132 | * a reasonable amount of time then attract this newly |
1183 | * woken task: | 1133 | * woken task: |
1184 | */ | 1134 | */ |
1185 | if (sync && balanced) { | 1135 | if (sync && balanced) |
1186 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | 1136 | return 1; |
1187 | p->se.avg_overlap < sysctl_sched_migration_cost) | ||
1188 | return 1; | ||
1189 | } | ||
1190 | 1137 | ||
1191 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | 1138 | schedstat_inc(p, se.nr_wakeups_affine_attempts); |
1192 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1139 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1193 | 1140 | ||
1194 | if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || | 1141 | if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= |
1195 | balanced) { | 1142 | tl_per_task)) { |
1196 | /* | 1143 | /* |
1197 | * This domain has SD_WAKE_AFFINE and | 1144 | * This domain has SD_WAKE_AFFINE and |
1198 | * p is cache cold in this domain, and | 1145 | * p is cache cold in this domain, and |
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) | |||
1211 | struct sched_domain *sd, *this_sd = NULL; | 1158 | struct sched_domain *sd, *this_sd = NULL; |
1212 | int prev_cpu, this_cpu, new_cpu; | 1159 | int prev_cpu, this_cpu, new_cpu; |
1213 | unsigned long load, this_load; | 1160 | unsigned long load, this_load; |
1214 | struct rq *rq, *this_rq; | 1161 | struct rq *this_rq; |
1215 | unsigned int imbalance; | 1162 | unsigned int imbalance; |
1216 | int idx; | 1163 | int idx; |
1217 | 1164 | ||
1218 | prev_cpu = task_cpu(p); | 1165 | prev_cpu = task_cpu(p); |
1219 | rq = task_rq(p); | ||
1220 | this_cpu = smp_processor_id(); | 1166 | this_cpu = smp_processor_id(); |
1221 | this_rq = cpu_rq(this_cpu); | 1167 | this_rq = cpu_rq(this_cpu); |
1222 | new_cpu = prev_cpu; | 1168 | new_cpu = prev_cpu; |
1223 | 1169 | ||
1170 | if (prev_cpu == this_cpu) | ||
1171 | goto out; | ||
1224 | /* | 1172 | /* |
1225 | * 'this_sd' is the first domain that both | 1173 | * 'this_sd' is the first domain that both |
1226 | * this_cpu and prev_cpu are present in: | 1174 | * this_cpu and prev_cpu are present in: |
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) | |||
1248 | load = source_load(prev_cpu, idx); | 1196 | load = source_load(prev_cpu, idx); |
1249 | this_load = target_load(this_cpu, idx); | 1197 | this_load = target_load(this_cpu, idx); |
1250 | 1198 | ||
1251 | if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, | 1199 | if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, |
1252 | load, this_load, imbalance)) | 1200 | load, this_load, imbalance)) |
1253 | return this_cpu; | 1201 | return this_cpu; |
1254 | 1202 | ||
1255 | if (prev_cpu == this_cpu) | ||
1256 | goto out; | ||
1257 | |||
1258 | /* | 1203 | /* |
1259 | * Start passive balancing when half the imbalance_pct | 1204 | * Start passive balancing when half the imbalance_pct |
1260 | * limit is reached. | 1205 | * limit is reached. |
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se) | |||
1281 | * + nice tasks. | 1226 | * + nice tasks. |
1282 | */ | 1227 | */ |
1283 | if (sched_feat(ASYM_GRAN)) | 1228 | if (sched_feat(ASYM_GRAN)) |
1284 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | 1229 | gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); |
1285 | else | ||
1286 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); | ||
1287 | 1230 | ||
1288 | return gran; | 1231 | return gran; |
1289 | } | 1232 | } |
1290 | 1233 | ||
1291 | /* | 1234 | /* |
1292 | * Should 'se' preempt 'curr'. | ||
1293 | * | ||
1294 | * |s1 | ||
1295 | * |s2 | ||
1296 | * |s3 | ||
1297 | * g | ||
1298 | * |<--->|c | ||
1299 | * | ||
1300 | * w(c, s1) = -1 | ||
1301 | * w(c, s2) = 0 | ||
1302 | * w(c, s3) = 1 | ||
1303 | * | ||
1304 | */ | ||
1305 | static int | ||
1306 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | ||
1307 | { | ||
1308 | s64 gran, vdiff = curr->vruntime - se->vruntime; | ||
1309 | |||
1310 | if (vdiff < 0) | ||
1311 | return -1; | ||
1312 | |||
1313 | gran = wakeup_gran(curr); | ||
1314 | if (vdiff > gran) | ||
1315 | return 1; | ||
1316 | |||
1317 | return 0; | ||
1318 | } | ||
1319 | |||
1320 | /* return depth at which a sched entity is present in the hierarchy */ | ||
1321 | static inline int depth_se(struct sched_entity *se) | ||
1322 | { | ||
1323 | int depth = 0; | ||
1324 | |||
1325 | for_each_sched_entity(se) | ||
1326 | depth++; | ||
1327 | |||
1328 | return depth; | ||
1329 | } | ||
1330 | |||
1331 | /* | ||
1332 | * Preempt the current task with a newly woken task if needed: | 1235 | * Preempt the current task with a newly woken task if needed: |
1333 | */ | 1236 | */ |
1334 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1237 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) |
1335 | { | 1238 | { |
1336 | struct task_struct *curr = rq->curr; | 1239 | struct task_struct *curr = rq->curr; |
1337 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1240 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1338 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1241 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1339 | int se_depth, pse_depth; | 1242 | s64 delta_exec; |
1340 | 1243 | ||
1341 | if (unlikely(rt_prio(p->prio))) { | 1244 | if (unlikely(rt_prio(p->prio))) { |
1342 | update_rq_clock(rq); | 1245 | update_rq_clock(rq); |
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1351 | cfs_rq_of(pse)->next = pse; | 1254 | cfs_rq_of(pse)->next = pse; |
1352 | 1255 | ||
1353 | /* | 1256 | /* |
1257 | * We can come here with TIF_NEED_RESCHED already set from new task | ||
1258 | * wake up path. | ||
1259 | */ | ||
1260 | if (test_tsk_need_resched(curr)) | ||
1261 | return; | ||
1262 | |||
1263 | /* | ||
1354 | * Batch tasks do not preempt (their preemption is driven by | 1264 | * Batch tasks do not preempt (their preemption is driven by |
1355 | * the tick): | 1265 | * the tick): |
1356 | */ | 1266 | */ |
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1360 | if (!sched_feat(WAKEUP_PREEMPT)) | 1270 | if (!sched_feat(WAKEUP_PREEMPT)) |
1361 | return; | 1271 | return; |
1362 | 1272 | ||
1363 | /* | 1273 | if (sched_feat(WAKEUP_OVERLAP) && (sync || |
1364 | * preemption test can be made between sibling entities who are in the | 1274 | (se->avg_overlap < sysctl_sched_migration_cost && |
1365 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | 1275 | pse->avg_overlap < sysctl_sched_migration_cost))) { |
1366 | * both tasks until we find their ancestors who are siblings of common | 1276 | resched_task(curr); |
1367 | * parent. | 1277 | return; |
1368 | */ | ||
1369 | |||
1370 | /* First walk up until both entities are at same depth */ | ||
1371 | se_depth = depth_se(se); | ||
1372 | pse_depth = depth_se(pse); | ||
1373 | |||
1374 | while (se_depth > pse_depth) { | ||
1375 | se_depth--; | ||
1376 | se = parent_entity(se); | ||
1377 | } | ||
1378 | |||
1379 | while (pse_depth > se_depth) { | ||
1380 | pse_depth--; | ||
1381 | pse = parent_entity(pse); | ||
1382 | } | ||
1383 | |||
1384 | while (!is_same_group(se, pse)) { | ||
1385 | se = parent_entity(se); | ||
1386 | pse = parent_entity(pse); | ||
1387 | } | 1278 | } |
1388 | 1279 | ||
1389 | if (wakeup_preempt_entity(se, pse) == 1) | 1280 | delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
1281 | if (delta_exec > wakeup_gran(pse)) | ||
1390 | resched_task(curr); | 1282 | resched_task(curr); |
1391 | } | 1283 | } |
1392 | 1284 | ||
@@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | |||
1445 | if (next == &cfs_rq->tasks) | 1337 | if (next == &cfs_rq->tasks) |
1446 | return NULL; | 1338 | return NULL; |
1447 | 1339 | ||
1448 | /* Skip over entities that are not tasks */ | 1340 | se = list_entry(next, struct sched_entity, group_node); |
1449 | do { | 1341 | p = task_of(se); |
1450 | se = list_entry(next, struct sched_entity, group_node); | 1342 | cfs_rq->balance_iterator = next->next; |
1451 | next = next->next; | ||
1452 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
1453 | |||
1454 | if (next == &cfs_rq->tasks) | ||
1455 | return NULL; | ||
1456 | |||
1457 | cfs_rq->balance_iterator = next; | ||
1458 | |||
1459 | if (entity_is_task(se)) | ||
1460 | p = task_of(se); | ||
1461 | 1343 | ||
1462 | return p; | 1344 | return p; |
1463 | } | 1345 | } |
@@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1507 | rcu_read_lock(); | 1389 | rcu_read_lock(); |
1508 | update_h_load(busiest_cpu); | 1390 | update_h_load(busiest_cpu); |
1509 | 1391 | ||
1510 | list_for_each_entry(tg, &task_groups, list) { | 1392 | list_for_each_entry_rcu(tg, &task_groups, list) { |
1511 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | 1393 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; |
1512 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | 1394 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; |
1513 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 1395 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
@@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1620 | * 'current' within the tree based on its new key value. | 1502 | * 'current' within the tree based on its new key value. |
1621 | */ | 1503 | */ |
1622 | swap(curr->vruntime, se->vruntime); | 1504 | swap(curr->vruntime, se->vruntime); |
1505 | resched_task(rq->curr); | ||
1623 | } | 1506 | } |
1624 | 1507 | ||
1625 | enqueue_task_fair(rq, p, 0); | 1508 | enqueue_task_fair(rq, p, 0); |
1626 | resched_task(rq->curr); | ||
1627 | } | 1509 | } |
1628 | 1510 | ||
1629 | /* | 1511 | /* |
@@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, | |||
1642 | if (p->prio > oldprio) | 1524 | if (p->prio > oldprio) |
1643 | resched_task(rq->curr); | 1525 | resched_task(rq->curr); |
1644 | } else | 1526 | } else |
1645 | check_preempt_curr(rq, p); | 1527 | check_preempt_curr(rq, p, 0); |
1646 | } | 1528 | } |
1647 | 1529 | ||
1648 | /* | 1530 | /* |
@@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, | |||
1659 | if (running) | 1541 | if (running) |
1660 | resched_task(rq->curr); | 1542 | resched_task(rq->curr); |
1661 | else | 1543 | else |
1662 | check_preempt_curr(rq, p); | 1544 | check_preempt_curr(rq, p, 0); |
1663 | } | 1545 | } |
1664 | 1546 | ||
1665 | /* Account for a task changing its policy or group. | 1547 | /* Account for a task changing its policy or group. |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154e..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) | |||
11 | SCHED_FEAT(LB_BIAS, 1) | 11 | SCHED_FEAT(LB_BIAS, 1) |
12 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) | 12 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) |
13 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | 13 | SCHED_FEAT(ASYM_EFF_LOAD, 1) |
14 | SCHED_FEAT(WAKEUP_OVERLAP, 0) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) | |||
14 | /* | 14 | /* |
15 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
16 | */ | 16 | */ |
17 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) | 17 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) |
18 | { | 18 | { |
19 | resched_task(rq->idle); | 19 | resched_task(rq->idle); |
20 | } | 20 | } |
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, | |||
76 | if (running) | 76 | if (running) |
77 | resched_task(rq->curr); | 77 | resched_task(rq->curr); |
78 | else | 78 | else |
79 | check_preempt_curr(rq, p); | 79 | check_preempt_curr(rq, p, 0); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | 82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, |
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, | |||
93 | if (p->prio > oldprio) | 93 | if (p->prio > oldprio) |
94 | resched_task(rq->curr); | 94 | resched_task(rq->curr); |
95 | } else | 95 | } else |
96 | check_preempt_curr(rq, p); | 96 | check_preempt_curr(rq, p, 0); |
97 | } | 97 | } |
98 | 98 | ||
99 | /* | 99 | /* |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1113157b2058..cdf5740ab03e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
102 | 102 | ||
103 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 103 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
104 | { | 104 | { |
105 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
105 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 106 | struct sched_rt_entity *rt_se = rt_rq->rt_se; |
106 | 107 | ||
107 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | 108 | if (rt_rq->rt_nr_running) { |
108 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 109 | if (rt_se && !on_rt_rq(rt_se)) |
109 | 110 | enqueue_rt_entity(rt_se); | |
110 | enqueue_rt_entity(rt_se); | ||
111 | if (rt_rq->highest_prio < curr->prio) | 111 | if (rt_rq->highest_prio < curr->prio) |
112 | resched_task(curr); | 112 | resched_task(curr); |
113 | } | 113 | } |
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
231 | #endif /* CONFIG_RT_GROUP_SCHED */ | 231 | #endif /* CONFIG_RT_GROUP_SCHED */ |
232 | 232 | ||
233 | #ifdef CONFIG_SMP | 233 | #ifdef CONFIG_SMP |
234 | /* | ||
235 | * We ran out of runtime, see if we can borrow some from our neighbours. | ||
236 | */ | ||
234 | static int do_balance_runtime(struct rt_rq *rt_rq) | 237 | static int do_balance_runtime(struct rt_rq *rt_rq) |
235 | { | 238 | { |
236 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 239 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) | |||
250 | continue; | 253 | continue; |
251 | 254 | ||
252 | spin_lock(&iter->rt_runtime_lock); | 255 | spin_lock(&iter->rt_runtime_lock); |
256 | /* | ||
257 | * Either all rqs have inf runtime and there's nothing to steal | ||
258 | * or __disable_runtime() below sets a specific rq to inf to | ||
259 | * indicate it's been disabled and disallow stealing. | ||
260 | */ | ||
253 | if (iter->rt_runtime == RUNTIME_INF) | 261 | if (iter->rt_runtime == RUNTIME_INF) |
254 | goto next; | 262 | goto next; |
255 | 263 | ||
264 | /* | ||
265 | * From runqueues with spare time, take 1/n part of their | ||
266 | * spare time, but no more than our period. | ||
267 | */ | ||
256 | diff = iter->rt_runtime - iter->rt_time; | 268 | diff = iter->rt_runtime - iter->rt_time; |
257 | if (diff > 0) { | 269 | if (diff > 0) { |
258 | diff = div_u64((u64)diff, weight); | 270 | diff = div_u64((u64)diff, weight); |
@@ -274,6 +286,9 @@ next: | |||
274 | return more; | 286 | return more; |
275 | } | 287 | } |
276 | 288 | ||
289 | /* | ||
290 | * Ensure this RQ takes back all the runtime it lent to its neighbours. | ||
291 | */ | ||
277 | static void __disable_runtime(struct rq *rq) | 292 | static void __disable_runtime(struct rq *rq) |
278 | { | 293 | { |
279 | struct root_domain *rd = rq->rd; | 294 | struct root_domain *rd = rq->rd; |
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) | |||
289 | 304 | ||
290 | spin_lock(&rt_b->rt_runtime_lock); | 305 | spin_lock(&rt_b->rt_runtime_lock); |
291 | spin_lock(&rt_rq->rt_runtime_lock); | 306 | spin_lock(&rt_rq->rt_runtime_lock); |
307 | /* | ||
308 | * Either we're all inf and nobody needs to borrow, or we're | ||
309 | * already disabled and thus have nothing to do, or we have | ||
310 | * exactly the right amount of runtime to take out. | ||
311 | */ | ||
292 | if (rt_rq->rt_runtime == RUNTIME_INF || | 312 | if (rt_rq->rt_runtime == RUNTIME_INF || |
293 | rt_rq->rt_runtime == rt_b->rt_runtime) | 313 | rt_rq->rt_runtime == rt_b->rt_runtime) |
294 | goto balanced; | 314 | goto balanced; |
295 | spin_unlock(&rt_rq->rt_runtime_lock); | 315 | spin_unlock(&rt_rq->rt_runtime_lock); |
296 | 316 | ||
317 | /* | ||
318 | * Calculate the difference between what we started out with | ||
319 | * and what we currently have, that's the amount of runtime | ||
320 | * we lent and now have to reclaim. | ||
321 | */ | ||
297 | want = rt_b->rt_runtime - rt_rq->rt_runtime; | 322 | want = rt_b->rt_runtime - rt_rq->rt_runtime; |
298 | 323 | ||
324 | /* | ||
325 | * Greedy reclaim, take back as much as we can. | ||
326 | */ | ||
299 | for_each_cpu_mask(i, rd->span) { | 327 | for_each_cpu_mask(i, rd->span) { |
300 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | 328 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); |
301 | s64 diff; | 329 | s64 diff; |
302 | 330 | ||
331 | /* | ||
332 | * Can't reclaim from ourselves or disabled runqueues. | ||
333 | */ | ||
303 | if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) | 334 | if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) |
304 | continue; | 335 | continue; |
305 | 336 | ||
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) | |||
319 | } | 350 | } |
320 | 351 | ||
321 | spin_lock(&rt_rq->rt_runtime_lock); | 352 | spin_lock(&rt_rq->rt_runtime_lock); |
353 | /* | ||
354 | * We cannot be left wanting - that would mean some runtime | ||
355 | * leaked out of the system. | ||
356 | */ | ||
322 | BUG_ON(want); | 357 | BUG_ON(want); |
323 | balanced: | 358 | balanced: |
359 | /* | ||
360 | * Disable all the borrow logic by pretending we have inf | ||
361 | * runtime - in which case borrowing doesn't make sense. | ||
362 | */ | ||
324 | rt_rq->rt_runtime = RUNTIME_INF; | 363 | rt_rq->rt_runtime = RUNTIME_INF; |
325 | spin_unlock(&rt_rq->rt_runtime_lock); | 364 | spin_unlock(&rt_rq->rt_runtime_lock); |
326 | spin_unlock(&rt_b->rt_runtime_lock); | 365 | spin_unlock(&rt_b->rt_runtime_lock); |
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) | |||
343 | if (unlikely(!scheduler_running)) | 382 | if (unlikely(!scheduler_running)) |
344 | return; | 383 | return; |
345 | 384 | ||
385 | /* | ||
386 | * Reset each runqueue's bandwidth settings | ||
387 | */ | ||
346 | for_each_leaf_rt_rq(rt_rq, rq) { | 388 | for_each_leaf_rt_rq(rt_rq, rq) { |
347 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 389 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
348 | 390 | ||
@@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
389 | int i, idle = 1; | 431 | int i, idle = 1; |
390 | cpumask_t span; | 432 | cpumask_t span; |
391 | 433 | ||
392 | if (rt_b->rt_runtime == RUNTIME_INF) | 434 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
393 | return 1; | 435 | return 1; |
394 | 436 | ||
395 | span = sched_rt_period_mask(); | 437 | span = sched_rt_period_mask(); |
@@ -487,6 +529,9 @@ static void update_curr_rt(struct rq *rq) | |||
487 | curr->se.exec_start = rq->clock; | 529 | curr->se.exec_start = rq->clock; |
488 | cpuacct_charge(curr, delta_exec); | 530 | cpuacct_charge(curr, delta_exec); |
489 | 531 | ||
532 | if (!rt_bandwidth_enabled()) | ||
533 | return; | ||
534 | |||
490 | for_each_sched_rt_entity(rt_se) { | 535 | for_each_sched_rt_entity(rt_se) { |
491 | rt_rq = rt_rq_of_se(rt_se); | 536 | rt_rq = rt_rq_of_se(rt_se); |
492 | 537 | ||
@@ -784,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
784 | /* | 829 | /* |
785 | * Preempt the current task with a newly woken task if needed: | 830 | * Preempt the current task with a newly woken task if needed: |
786 | */ | 831 | */ |
787 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | 832 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) |
788 | { | 833 | { |
789 | if (p->prio < rq->curr->prio) { | 834 | if (p->prio < rq->curr->prio) { |
790 | resched_task(rq->curr); | 835 | resched_task(rq->curr); |
diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, | |||
169 | { | 169 | { |
170 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | 170 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
171 | 171 | ||
172 | return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); | 172 | return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); |
173 | } | 173 | } |
174 | 174 | ||
175 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | 175 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, |
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | |||
180 | unsigned long rt_runtime; | 180 | unsigned long rt_runtime; |
181 | int rc; | 181 | int rc; |
182 | 182 | ||
183 | sscanf(buf, "%lu", &rt_runtime); | 183 | sscanf(buf, "%ld", &rt_runtime); |
184 | 184 | ||
185 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); | 185 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); |
186 | 186 | ||