Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 544 |
1 file changed, 386 insertions, 158 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 04160d277e7a..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
201 | hrtimer_init(&rt_b->rt_period_timer, | 201 | hrtimer_init(&rt_b->rt_period_timer, |
202 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 202 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
203 | rt_b->rt_period_timer.function = sched_rt_period_timer; | 203 | rt_b->rt_period_timer.function = sched_rt_period_timer; |
204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | 204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; |
205 | } | ||
206 | |||
207 | static inline int rt_bandwidth_enabled(void) | ||
208 | { | ||
209 | return sysctl_sched_rt_runtime >= 0; | ||
205 | } | 210 | } |
206 | 211 | ||
207 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 212 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) |
208 | { | 213 | { |
209 | ktime_t now; | 214 | ktime_t now; |
210 | 215 | ||
211 | if (rt_b->rt_runtime == RUNTIME_INF) | 216 | if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) |
212 | return; | 217 | return; |
213 | 218 | ||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | 219 | if (hrtimer_active(&rt_b->rt_period_timer)) |
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | |||
298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 303 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 304 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
300 | #endif /* CONFIG_RT_GROUP_SCHED */ | 305 | #endif /* CONFIG_RT_GROUP_SCHED */ |
301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 306 | #else /* !CONFIG_USER_SCHED */ |
302 | #define root_task_group init_task_group | 307 | #define root_task_group init_task_group |
303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 308 | #endif /* CONFIG_USER_SCHED */ |
304 | 309 | ||
305 | /* task_group_lock serializes add/remove of task groups and also changes to | 310 | /* task_group_lock serializes add/remove of task groups and also changes to |
306 | * a task group's cpu shares. | 311 | * a task group's cpu shares. |
@@ -600,14 +605,13 @@ struct rq { | |||
600 | /* BKL stats */ | 605 | /* BKL stats */ |
601 | unsigned int bkl_count; | 606 | unsigned int bkl_count; |
602 | #endif | 607 | #endif |
603 | struct lock_class_key rq_lock_key; | ||
604 | }; | 608 | }; |
605 | 609 | ||
606 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 610 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
607 | 611 | ||
608 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 612 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) |
609 | { | 613 | { |
610 | rq->curr->sched_class->check_preempt_curr(rq, p); | 614 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); |
611 | } | 615 | } |
612 | 616 | ||
613 | static inline int cpu_of(struct rq *rq) | 617 | static inline int cpu_of(struct rq *rq) |
@@ -809,9 +813,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
809 | 813 | ||
810 | /* | 814 | /* |
811 | * ratelimit for updating the group shares. | 815 | * ratelimit for updating the group shares. |
812 | * default: 0.5ms | 816 | * default: 0.25ms |
813 | */ | 817 | */ |
814 | const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; | 818 | unsigned int sysctl_sched_shares_ratelimit = 250000; |
815 | 819 | ||
816 | /* | 820 | /* |
817 | * period over which we measure -rt task cpu usage in us. | 821 | * period over which we measure -rt task cpu usage in us. |
@@ -834,7 +838,7 @@ static inline u64 global_rt_period(void) | |||
834 | 838 | ||
835 | static inline u64 global_rt_runtime(void) | 839 | static inline u64 global_rt_runtime(void) |
836 | { | 840 | { |
837 | if (sysctl_sched_rt_period < 0) | 841 | if (sysctl_sched_rt_runtime < 0) |
838 | return RUNTIME_INF; | 842 | return RUNTIME_INF; |
839 | 843 | ||
840 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 844 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
@@ -1088,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
1088 | return NOTIFY_DONE; | 1092 | return NOTIFY_DONE; |
1089 | } | 1093 | } |
1090 | 1094 | ||
1091 | static void init_hrtick(void) | 1095 | static __init void init_hrtick(void) |
1092 | { | 1096 | { |
1093 | hotcpu_notifier(hotplug_hrtick, 0); | 1097 | hotcpu_notifier(hotplug_hrtick, 0); |
1094 | } | 1098 | } |
@@ -1103,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) | |||
1103 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); | 1107 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); |
1104 | } | 1108 | } |
1105 | 1109 | ||
1106 | static void init_hrtick(void) | 1110 | static inline void init_hrtick(void) |
1107 | { | 1111 | { |
1108 | } | 1112 | } |
1109 | #endif /* CONFIG_SMP */ | 1113 | #endif /* CONFIG_SMP */ |
@@ -1120,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq) | |||
1120 | 1124 | ||
1121 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1125 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1122 | rq->hrtick_timer.function = hrtick; | 1126 | rq->hrtick_timer.function = hrtick; |
1123 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | 1127 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; |
1124 | } | 1128 | } |
1125 | #else | 1129 | #else /* CONFIG_SCHED_HRTICK */ |
1126 | static inline void hrtick_clear(struct rq *rq) | 1130 | static inline void hrtick_clear(struct rq *rq) |
1127 | { | 1131 | { |
1128 | } | 1132 | } |
@@ -1134,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) | |||
1134 | static inline void init_hrtick(void) | 1138 | static inline void init_hrtick(void) |
1135 | { | 1139 | { |
1136 | } | 1140 | } |
1137 | #endif | 1141 | #endif /* CONFIG_SCHED_HRTICK */ |
1138 | 1142 | ||
1139 | /* | 1143 | /* |
1140 | * resched_task - mark a task 'to be rescheduled now'. | 1144 | * resched_task - mark a task 'to be rescheduled now'. |
@@ -1381,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1381 | update_load_sub(&rq->load, load); | 1385 | update_load_sub(&rq->load, load); |
1382 | } | 1386 | } |
1383 | 1387 | ||
1384 | #ifdef CONFIG_SMP | 1388 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) |
1385 | static unsigned long source_load(int cpu, int type); | 1389 | typedef int (*tg_visitor)(struct task_group *, void *); |
1386 | static unsigned long target_load(int cpu, int type); | ||
1387 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1388 | |||
1389 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1390 | { | ||
1391 | struct rq *rq = cpu_rq(cpu); | ||
1392 | |||
1393 | if (rq->nr_running) | ||
1394 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1395 | |||
1396 | return rq->avg_load_per_task; | ||
1397 | } | ||
1398 | |||
1399 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1400 | |||
1401 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
1402 | 1390 | ||
1403 | /* | 1391 | /* |
1404 | * Iterate the full tree, calling @down when first entering a node and @up when | 1392 | * Iterate the full tree, calling @down when first entering a node and @up when |
1405 | * leaving it for the final time. | 1393 | * leaving it for the final time. |
1406 | */ | 1394 | */ |
1407 | static void | 1395 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) |
1408 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
1409 | { | 1396 | { |
1410 | struct task_group *parent, *child; | 1397 | struct task_group *parent, *child; |
1398 | int ret; | ||
1411 | 1399 | ||
1412 | rcu_read_lock(); | 1400 | rcu_read_lock(); |
1413 | parent = &root_task_group; | 1401 | parent = &root_task_group; |
1414 | down: | 1402 | down: |
1415 | (*down)(parent, cpu, sd); | 1403 | ret = (*down)(parent, data); |
1404 | if (ret) | ||
1405 | goto out_unlock; | ||
1416 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1406 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1417 | parent = child; | 1407 | parent = child; |
1418 | goto down; | 1408 | goto down; |
@@ -1420,14 +1410,42 @@ down: | |||
1420 | up: | 1410 | up: |
1421 | continue; | 1411 | continue; |
1422 | } | 1412 | } |
1423 | (*up)(parent, cpu, sd); | 1413 | ret = (*up)(parent, data); |
1414 | if (ret) | ||
1415 | goto out_unlock; | ||
1424 | 1416 | ||
1425 | child = parent; | 1417 | child = parent; |
1426 | parent = parent->parent; | 1418 | parent = parent->parent; |
1427 | if (parent) | 1419 | if (parent) |
1428 | goto up; | 1420 | goto up; |
1421 | out_unlock: | ||
1429 | rcu_read_unlock(); | 1422 | rcu_read_unlock(); |
1423 | |||
1424 | return ret; | ||
1425 | } | ||
1426 | |||
1427 | static int tg_nop(struct task_group *tg, void *data) | ||
1428 | { | ||
1429 | return 0; | ||
1430 | } | 1430 | } |
1431 | #endif | ||
1432 | |||
1433 | #ifdef CONFIG_SMP | ||
1434 | static unsigned long source_load(int cpu, int type); | ||
1435 | static unsigned long target_load(int cpu, int type); | ||
1436 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1437 | |||
1438 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1439 | { | ||
1440 | struct rq *rq = cpu_rq(cpu); | ||
1441 | |||
1442 | if (rq->nr_running) | ||
1443 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1444 | |||
1445 | return rq->avg_load_per_task; | ||
1446 | } | ||
1447 | |||
1448 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1431 | 1449 | ||
1432 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1450 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1433 | 1451 | ||
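For reference, a minimal caller-side sketch of the reworked walk_tg_tree() interface introduced above: each visitor now returns 0 to continue or a non-zero value to abort the walk, and arbitrary state is passed through the void *data cookie. The visitor and wrapper names below (count_groups, nr_task_groups) are illustrative only and not part of this patch.

static int count_groups(struct task_group *tg, void *data)
{
        (*(int *)data)++;       /* visit hook: count every group once */
        return 0;               /* returning 0 keeps the walk going */
}

static int nr_task_groups(void)
{
        int count = 0;

        /* tg_nop (defined above) is used when only one direction matters */
        walk_tg_tree(count_groups, tg_nop, &count);
        return count;
}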
@@ -1487,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1487 | * This needs to be done in a bottom-up fashion because the rq weight of a | 1505 | * This needs to be done in a bottom-up fashion because the rq weight of a |
1488 | * parent group depends on the shares of its child groups. | 1506 | * parent group depends on the shares of its child groups. |
1489 | */ | 1507 | */ |
1490 | static void | 1508 | static int tg_shares_up(struct task_group *tg, void *data) |
1491 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1492 | { | 1509 | { |
1493 | unsigned long rq_weight = 0; | 1510 | unsigned long rq_weight = 0; |
1494 | unsigned long shares = 0; | 1511 | unsigned long shares = 0; |
1512 | struct sched_domain *sd = data; | ||
1495 | int i; | 1513 | int i; |
1496 | 1514 | ||
1497 | for_each_cpu_mask(i, sd->span) { | 1515 | for_each_cpu_mask(i, sd->span) { |
@@ -1516,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
1516 | __update_group_shares_cpu(tg, i, shares, rq_weight); | 1534 | __update_group_shares_cpu(tg, i, shares, rq_weight); |
1517 | spin_unlock_irqrestore(&rq->lock, flags); | 1535 | spin_unlock_irqrestore(&rq->lock, flags); |
1518 | } | 1536 | } |
1537 | |||
1538 | return 0; | ||
1519 | } | 1539 | } |
1520 | 1540 | ||
1521 | /* | 1541 | /* |
@@ -1523,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
1523 | * This needs to be done in a top-down fashion because the load of a child | 1543 | * This needs to be done in a top-down fashion because the load of a child |
1524 | * group is a fraction of its parents load. | 1544 | * group is a fraction of its parents load. |
1525 | */ | 1545 | */ |
1526 | static void | 1546 | static int tg_load_down(struct task_group *tg, void *data) |
1527 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1528 | { | 1547 | { |
1529 | unsigned long load; | 1548 | unsigned long load; |
1549 | long cpu = (long)data; | ||
1530 | 1550 | ||
1531 | if (!tg->parent) { | 1551 | if (!tg->parent) { |
1532 | load = cpu_rq(cpu)->load.weight; | 1552 | load = cpu_rq(cpu)->load.weight; |
@@ -1537,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
1537 | } | 1557 | } |
1538 | 1558 | ||
1539 | tg->cfs_rq[cpu]->h_load = load; | 1559 | tg->cfs_rq[cpu]->h_load = load; |
1540 | } | ||
1541 | 1560 | ||
1542 | static void | 1561 | return 0; |
1543 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1544 | { | ||
1545 | } | 1562 | } |
1546 | 1563 | ||
1547 | static void update_shares(struct sched_domain *sd) | 1564 | static void update_shares(struct sched_domain *sd) |
@@ -1551,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) | |||
1551 | 1568 | ||
1552 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1569 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1553 | sd->last_update = now; | 1570 | sd->last_update = now; |
1554 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | 1571 | walk_tg_tree(tg_nop, tg_shares_up, sd); |
1555 | } | 1572 | } |
1556 | } | 1573 | } |
1557 | 1574 | ||
@@ -1562,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1562 | spin_lock(&rq->lock); | 1579 | spin_lock(&rq->lock); |
1563 | } | 1580 | } |
1564 | 1581 | ||
1565 | static void update_h_load(int cpu) | 1582 | static void update_h_load(long cpu) |
1566 | { | 1583 | { |
1567 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | 1584 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1568 | } | 1585 | } |
1569 | 1586 | ||
1570 | #else | 1587 | #else |
@@ -1922,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
1922 | running = task_running(rq, p); | 1939 | running = task_running(rq, p); |
1923 | on_rq = p->se.on_rq; | 1940 | on_rq = p->se.on_rq; |
1924 | ncsw = 0; | 1941 | ncsw = 0; |
1925 | if (!match_state || p->state == match_state) { | 1942 | if (!match_state || p->state == match_state) |
1926 | ncsw = p->nivcsw + p->nvcsw; | 1943 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
1927 | if (unlikely(!ncsw)) | ||
1928 | ncsw = 1; | ||
1929 | } | ||
1930 | task_rq_unlock(rq, &flags); | 1944 | task_rq_unlock(rq, &flags); |
1931 | 1945 | ||
1932 | /* | 1946 | /* |
@@ -2286,7 +2300,7 @@ out_running: | |||
2286 | trace_mark(kernel_sched_wakeup, | 2300 | trace_mark(kernel_sched_wakeup, |
2287 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2301 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
2288 | p->pid, p->state, rq, p, rq->curr); | 2302 | p->pid, p->state, rq, p, rq->curr); |
2289 | check_preempt_curr(rq, p); | 2303 | check_preempt_curr(rq, p, sync); |
2290 | 2304 | ||
2291 | p->state = TASK_RUNNING; | 2305 | p->state = TASK_RUNNING; |
2292 | #ifdef CONFIG_SMP | 2306 | #ifdef CONFIG_SMP |
@@ -2421,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2421 | trace_mark(kernel_sched_wakeup_new, | 2435 | trace_mark(kernel_sched_wakeup_new, |
2422 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2436 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
2423 | p->pid, p->state, rq, p, rq->curr); | 2437 | p->pid, p->state, rq, p, rq->curr); |
2424 | check_preempt_curr(rq, p); | 2438 | check_preempt_curr(rq, p, 0); |
2425 | #ifdef CONFIG_SMP | 2439 | #ifdef CONFIG_SMP |
2426 | if (p->sched_class->task_wake_up) | 2440 | if (p->sched_class->task_wake_up) |
2427 | p->sched_class->task_wake_up(rq, p); | 2441 | p->sched_class->task_wake_up(rq, p); |
@@ -2759,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
2759 | } else { | 2773 | } else { |
2760 | if (rq1 < rq2) { | 2774 | if (rq1 < rq2) { |
2761 | spin_lock(&rq1->lock); | 2775 | spin_lock(&rq1->lock); |
2762 | spin_lock(&rq2->lock); | 2776 | spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); |
2763 | } else { | 2777 | } else { |
2764 | spin_lock(&rq2->lock); | 2778 | spin_lock(&rq2->lock); |
2765 | spin_lock(&rq1->lock); | 2779 | spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); |
2766 | } | 2780 | } |
2767 | } | 2781 | } |
2768 | update_rq_clock(rq1); | 2782 | update_rq_clock(rq1); |
@@ -2805,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2805 | if (busiest < this_rq) { | 2819 | if (busiest < this_rq) { |
2806 | spin_unlock(&this_rq->lock); | 2820 | spin_unlock(&this_rq->lock); |
2807 | spin_lock(&busiest->lock); | 2821 | spin_lock(&busiest->lock); |
2808 | spin_lock(&this_rq->lock); | 2822 | spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); |
2809 | ret = 1; | 2823 | ret = 1; |
2810 | } else | 2824 | } else |
2811 | spin_lock(&busiest->lock); | 2825 | spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); |
2812 | } | 2826 | } |
2813 | return ret; | 2827 | return ret; |
2814 | } | 2828 | } |
2815 | 2829 | ||
2830 | static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
2831 | __releases(busiest->lock) | ||
2832 | { | ||
2833 | spin_unlock(&busiest->lock); | ||
2834 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
2835 | } | ||
2836 | |||
2816 | /* | 2837 | /* |
2817 | * If dest_cpu is allowed for this process, migrate the task to it. | 2838 | * If dest_cpu is allowed for this process, migrate the task to it. |
2818 | * This is accomplished by forcing the cpu_allowed mask to only | 2839 | * This is accomplished by forcing the cpu_allowed mask to only |
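A hedged usage sketch of the lock-ordering helpers touched above: double_lock_balance() may drop this_rq->lock (and return 1) so the two runqueue locks can be taken in address order, meaning callers must revalidate any state read beforehand, while the new double_unlock_balance() releases only the busiest lock and restores this_rq's lockdep subclass. The caller below is hypothetical.

static void balance_from(struct rq *this_rq, struct rq *busiest)
{
        /* this_rq->lock is already held by the caller */
        if (double_lock_balance(this_rq, busiest)) {
                /* this_rq->lock was dropped and re-acquired: recheck state */
        }

        /* ... pull tasks from busiest onto this_rq ... */

        double_unlock_balance(this_rq, busiest);        /* this_rq->lock stays held */
}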
@@ -2874,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
2874 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2895 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2875 | * to be always true for them. | 2896 | * to be always true for them. |
2876 | */ | 2897 | */ |
2877 | check_preempt_curr(this_rq, p); | 2898 | check_preempt_curr(this_rq, p, 0); |
2878 | } | 2899 | } |
2879 | 2900 | ||
2880 | /* | 2901 | /* |
@@ -3637,7 +3658,7 @@ redo: | |||
3637 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3658 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
3638 | imbalance, sd, CPU_NEWLY_IDLE, | 3659 | imbalance, sd, CPU_NEWLY_IDLE, |
3639 | &all_pinned); | 3660 | &all_pinned); |
3640 | spin_unlock(&busiest->lock); | 3661 | double_unlock_balance(this_rq, busiest); |
3641 | 3662 | ||
3642 | if (unlikely(all_pinned)) { | 3663 | if (unlikely(all_pinned)) { |
3643 | cpu_clear(cpu_of(busiest), *cpus); | 3664 | cpu_clear(cpu_of(busiest), *cpus); |
@@ -3752,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3752 | else | 3773 | else |
3753 | schedstat_inc(sd, alb_failed); | 3774 | schedstat_inc(sd, alb_failed); |
3754 | } | 3775 | } |
3755 | spin_unlock(&target_rq->lock); | 3776 | double_unlock_balance(busiest_rq, target_rq); |
3756 | } | 3777 | } |
3757 | 3778 | ||
3758 | #ifdef CONFIG_NO_HZ | 3779 | #ifdef CONFIG_NO_HZ |
@@ -4173,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
4173 | } | 4194 | } |
4174 | 4195 | ||
4175 | /* | 4196 | /* |
4197 | * Use precise platform statistics if available: | ||
4198 | */ | ||
4199 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
4200 | cputime_t task_utime(struct task_struct *p) | ||
4201 | { | ||
4202 | return p->utime; | ||
4203 | } | ||
4204 | |||
4205 | cputime_t task_stime(struct task_struct *p) | ||
4206 | { | ||
4207 | return p->stime; | ||
4208 | } | ||
4209 | #else | ||
4210 | cputime_t task_utime(struct task_struct *p) | ||
4211 | { | ||
4212 | clock_t utime = cputime_to_clock_t(p->utime), | ||
4213 | total = utime + cputime_to_clock_t(p->stime); | ||
4214 | u64 temp; | ||
4215 | |||
4216 | /* | ||
4217 | * Use CFS's precise accounting: | ||
4218 | */ | ||
4219 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | ||
4220 | |||
4221 | if (total) { | ||
4222 | temp *= utime; | ||
4223 | do_div(temp, total); | ||
4224 | } | ||
4225 | utime = (clock_t)temp; | ||
4226 | |||
4227 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | ||
4228 | return p->prev_utime; | ||
4229 | } | ||
4230 | |||
4231 | cputime_t task_stime(struct task_struct *p) | ||
4232 | { | ||
4233 | clock_t stime; | ||
4234 | |||
4235 | /* | ||
4236 | * Use CFS's precise accounting. (we subtract utime from | ||
4237 | * the total, to make sure the total observed by userspace | ||
4238 | * grows monotonically - apps rely on that): | ||
4239 | */ | ||
4240 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
4241 | cputime_to_clock_t(task_utime(p)); | ||
4242 | |||
4243 | if (stime >= 0) | ||
4244 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | ||
4245 | |||
4246 | return p->prev_stime; | ||
4247 | } | ||
4248 | #endif | ||
4249 | |||
4250 | inline cputime_t task_gtime(struct task_struct *p) | ||
4251 | { | ||
4252 | return p->gtime; | ||
4253 | } | ||
4254 | |||
4255 | /* | ||
4176 | * This function gets called by the timer code, with HZ frequency. | 4256 | * This function gets called by the timer code, with HZ frequency. |
4177 | * We call it with interrupts disabled. | 4257 | * We call it with interrupts disabled. |
4178 | * | 4258 | * |
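A small sketch (not part of the patch) of the scaling task_utime() above performs: the precise CFS runtime p->se.sum_exec_runtime is split between user and system time in the same ratio as the sampled utime/stime ticks, and the prev_utime/prev_stime maxima keep the values reported to userspace monotonic. The helper name is illustrative.

static u64 split_utime(u64 utime_ticks, u64 stime_ticks, u64 precise_ticks)
{
        u64 total = utime_ticks + stime_ticks;

        if (!total)
                return precise_ticks;   /* no samples yet: charge it all as utime */

        /* e.g. utime = 3, stime = 1, precise = 8  ->  8 * 3 / 4 = 6 */
        return div64_u64(precise_ticks * utime_ticks, total);
}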
@@ -4562,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | |||
4562 | } | 4642 | } |
4563 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4643 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
4564 | 4644 | ||
4645 | /** | ||
4646 | * complete: - signals a single thread waiting on this completion | ||
4647 | * @x: holds the state of this particular completion | ||
4648 | * | ||
4649 | * This will wake up a single thread waiting on this completion. Threads will be | ||
4650 | * awakened in the same order in which they were queued. | ||
4651 | * | ||
4652 | * See also complete_all(), wait_for_completion() and related routines. | ||
4653 | */ | ||
4565 | void complete(struct completion *x) | 4654 | void complete(struct completion *x) |
4566 | { | 4655 | { |
4567 | unsigned long flags; | 4656 | unsigned long flags; |
@@ -4573,6 +4662,12 @@ void complete(struct completion *x) | |||
4573 | } | 4662 | } |
4574 | EXPORT_SYMBOL(complete); | 4663 | EXPORT_SYMBOL(complete); |
4575 | 4664 | ||
4665 | /** | ||
4666 | * complete_all: - signals all threads waiting on this completion | ||
4667 | * @x: holds the state of this particular completion | ||
4668 | * | ||
4669 | * This will wake up all threads waiting on this particular completion event. | ||
4670 | */ | ||
4576 | void complete_all(struct completion *x) | 4671 | void complete_all(struct completion *x) |
4577 | { | 4672 | { |
4578 | unsigned long flags; | 4673 | unsigned long flags; |
@@ -4593,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
4593 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4688 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
4594 | __add_wait_queue_tail(&x->wait, &wait); | 4689 | __add_wait_queue_tail(&x->wait, &wait); |
4595 | do { | 4690 | do { |
4596 | if ((state == TASK_INTERRUPTIBLE && | 4691 | if (signal_pending_state(state, current)) { |
4597 | signal_pending(current)) || | ||
4598 | (state == TASK_KILLABLE && | ||
4599 | fatal_signal_pending(current))) { | ||
4600 | timeout = -ERESTARTSYS; | 4692 | timeout = -ERESTARTSYS; |
4601 | break; | 4693 | break; |
4602 | } | 4694 | } |
@@ -4624,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) | |||
4624 | return timeout; | 4716 | return timeout; |
4625 | } | 4717 | } |
4626 | 4718 | ||
4719 | /** | ||
4720 | * wait_for_completion: - waits for completion of a task | ||
4721 | * @x: holds the state of this particular completion | ||
4722 | * | ||
4723 | * This waits to be signaled for completion of a specific task. It is NOT | ||
4724 | * interruptible and there is no timeout. | ||
4725 | * | ||
4726 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
4727 | * and interrupt capability. Also see complete(). | ||
4728 | */ | ||
4627 | void __sched wait_for_completion(struct completion *x) | 4729 | void __sched wait_for_completion(struct completion *x) |
4628 | { | 4730 | { |
4629 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4731 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
4630 | } | 4732 | } |
4631 | EXPORT_SYMBOL(wait_for_completion); | 4733 | EXPORT_SYMBOL(wait_for_completion); |
4632 | 4734 | ||
4735 | /** | ||
4736 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
4737 | * @x: holds the state of this particular completion | ||
4738 | * @timeout: timeout value in jiffies | ||
4739 | * | ||
4740 | * This waits for either a completion of a specific task to be signaled or for a | ||
4741 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
4742 | * interruptible. | ||
4743 | */ | ||
4633 | unsigned long __sched | 4744 | unsigned long __sched |
4634 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4745 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
4635 | { | 4746 | { |
@@ -4637,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
4637 | } | 4748 | } |
4638 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4749 | EXPORT_SYMBOL(wait_for_completion_timeout); |
4639 | 4750 | ||
4751 | /** | ||
4752 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
4753 | * @x: holds the state of this particular completion | ||
4754 | * | ||
4755 | * This waits for completion of a specific task to be signaled. It is | ||
4756 | * interruptible. | ||
4757 | */ | ||
4640 | int __sched wait_for_completion_interruptible(struct completion *x) | 4758 | int __sched wait_for_completion_interruptible(struct completion *x) |
4641 | { | 4759 | { |
4642 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4760 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
@@ -4646,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) | |||
4646 | } | 4764 | } |
4647 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4765 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
4648 | 4766 | ||
4767 | /** | ||
4768 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
4769 | * @x: holds the state of this particular completion | ||
4770 | * @timeout: timeout value in jiffies | ||
4771 | * | ||
4772 | * This waits for either a completion of a specific task to be signaled or for a | ||
4773 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
4774 | */ | ||
4649 | unsigned long __sched | 4775 | unsigned long __sched |
4650 | wait_for_completion_interruptible_timeout(struct completion *x, | 4776 | wait_for_completion_interruptible_timeout(struct completion *x, |
4651 | unsigned long timeout) | 4777 | unsigned long timeout) |
@@ -4654,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, | |||
4654 | } | 4780 | } |
4655 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4781 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
4656 | 4782 | ||
4783 | /** | ||
4784 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
4785 | * @x: holds the state of this particular completion | ||
4786 | * | ||
4787 | * This waits to be signaled for completion of a specific task. It can be | ||
4788 | * interrupted by a kill signal. | ||
4789 | */ | ||
4657 | int __sched wait_for_completion_killable(struct completion *x) | 4790 | int __sched wait_for_completion_killable(struct completion *x) |
4658 | { | 4791 | { |
4659 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | 4792 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
@@ -4663,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x) | |||
4663 | } | 4796 | } |
4664 | EXPORT_SYMBOL(wait_for_completion_killable); | 4797 | EXPORT_SYMBOL(wait_for_completion_killable); |
4665 | 4798 | ||
4799 | /** | ||
4800 | * try_wait_for_completion - try to decrement a completion without blocking | ||
4801 | * @x: completion structure | ||
4802 | * | ||
4803 | * Returns: 0 if a decrement cannot be done without blocking | ||
4804 | * 1 if a decrement succeeded. | ||
4805 | * | ||
4806 | * If a completion is being used as a counting completion, | ||
4807 | * attempt to decrement the counter without blocking. This | ||
4808 | * enables us to avoid waiting if the resource the completion | ||
4809 | * is protecting is not available. | ||
4810 | */ | ||
4811 | bool try_wait_for_completion(struct completion *x) | ||
4812 | { | ||
4813 | int ret = 1; | ||
4814 | |||
4815 | spin_lock_irq(&x->wait.lock); | ||
4816 | if (!x->done) | ||
4817 | ret = 0; | ||
4818 | else | ||
4819 | x->done--; | ||
4820 | spin_unlock_irq(&x->wait.lock); | ||
4821 | return ret; | ||
4822 | } | ||
4823 | EXPORT_SYMBOL(try_wait_for_completion); | ||
4824 | |||
4825 | /** | ||
4826 | * completion_done - Test to see if a completion has any waiters | ||
4827 | * @x: completion structure | ||
4828 | * | ||
4829 | * Returns: 0 if there are waiters (wait_for_completion() in progress) | ||
4830 | * 1 if there are no waiters. | ||
4831 | * | ||
4832 | */ | ||
4833 | bool completion_done(struct completion *x) | ||
4834 | { | ||
4835 | int ret = 1; | ||
4836 | |||
4837 | spin_lock_irq(&x->wait.lock); | ||
4838 | if (!x->done) | ||
4839 | ret = 0; | ||
4840 | spin_unlock_irq(&x->wait.lock); | ||
4841 | return ret; | ||
4842 | } | ||
4843 | EXPORT_SYMBOL(completion_done); | ||
4844 | |||
4666 | static long __sched | 4845 | static long __sched |
4667 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 4846 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
4668 | { | 4847 | { |
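To complement the kernel-doc added above, a hypothetical driver-side sketch of a completion used as a counting resource: a free slot is taken without sleeping when one is available, otherwise the caller falls back to an interruptible wait. The names free_slots, grab_slot and release_slot are made up for illustration.

static struct completion free_slots;    /* initialised elsewhere, e.g. with init_completion() */

static int grab_slot(void)
{
        if (try_wait_for_completion(&free_slots))
                return 0;               /* got a slot without blocking */

        return wait_for_completion_interruptible(&free_slots);
}

static void release_slot(void)
{
        complete(&free_slots);          /* wake exactly one waiter */
}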
@@ -5010,7 +5189,8 @@ recheck: | |||
5010 | * Do not allow realtime tasks into groups that have no runtime | 5189 | * Do not allow realtime tasks into groups that have no runtime |
5011 | * assigned. | 5190 | * assigned. |
5012 | */ | 5191 | */ |
5013 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5192 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5193 | task_group(p)->rt_bandwidth.rt_runtime == 0) | ||
5014 | return -EPERM; | 5194 | return -EPERM; |
5015 | #endif | 5195 | #endif |
5016 | 5196 | ||
@@ -5734,6 +5914,8 @@ static inline void sched_init_granularity(void) | |||
5734 | sysctl_sched_latency = limit; | 5914 | sysctl_sched_latency = limit; |
5735 | 5915 | ||
5736 | sysctl_sched_wakeup_granularity *= factor; | 5916 | sysctl_sched_wakeup_granularity *= factor; |
5917 | |||
5918 | sysctl_sched_shares_ratelimit *= factor; | ||
5737 | } | 5919 | } |
5738 | 5920 | ||
5739 | #ifdef CONFIG_SMP | 5921 | #ifdef CONFIG_SMP |
@@ -5844,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5844 | set_task_cpu(p, dest_cpu); | 6026 | set_task_cpu(p, dest_cpu); |
5845 | if (on_rq) { | 6027 | if (on_rq) { |
5846 | activate_task(rq_dest, p, 0); | 6028 | activate_task(rq_dest, p, 0); |
5847 | check_preempt_curr(rq_dest, p); | 6029 | check_preempt_curr(rq_dest, p, 0); |
5848 | } | 6030 | } |
5849 | done: | 6031 | done: |
5850 | ret = 1; | 6032 | ret = 1; |
@@ -6169,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, | |||
6169 | static struct ctl_table * | 6351 | static struct ctl_table * |
6170 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 6352 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
6171 | { | 6353 | { |
6172 | struct ctl_table *table = sd_alloc_ctl_entry(12); | 6354 | struct ctl_table *table = sd_alloc_ctl_entry(13); |
6173 | 6355 | ||
6174 | if (table == NULL) | 6356 | if (table == NULL) |
6175 | return NULL; | 6357 | return NULL; |
@@ -6197,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
6197 | sizeof(int), 0644, proc_dointvec_minmax); | 6379 | sizeof(int), 0644, proc_dointvec_minmax); |
6198 | set_table_entry(&table[10], "flags", &sd->flags, | 6380 | set_table_entry(&table[10], "flags", &sd->flags, |
6199 | sizeof(int), 0644, proc_dointvec_minmax); | 6381 | sizeof(int), 0644, proc_dointvec_minmax); |
6200 | /* &table[11] is terminator */ | 6382 | set_table_entry(&table[11], "name", sd->name, |
6383 | CORENAME_MAX_SIZE, 0444, proc_dostring); | ||
6384 | /* &table[12] is terminator */ | ||
6201 | 6385 | ||
6202 | return table; | 6386 | return table; |
6203 | } | 6387 | } |
@@ -7081,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7081 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 7265 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
7082 | */ | 7266 | */ |
7083 | 7267 | ||
7268 | #ifdef CONFIG_SCHED_DEBUG | ||
7269 | # define SD_INIT_NAME(sd, type) sd->name = #type | ||
7270 | #else | ||
7271 | # define SD_INIT_NAME(sd, type) do { } while (0) | ||
7272 | #endif | ||
7273 | |||
7084 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7274 | #define SD_INIT(sd, type) sd_init_##type(sd) |
7275 | |||
7085 | #define SD_INIT_FUNC(type) \ | 7276 | #define SD_INIT_FUNC(type) \ |
7086 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7277 | static noinline void sd_init_##type(struct sched_domain *sd) \ |
7087 | { \ | 7278 | { \ |
7088 | memset(sd, 0, sizeof(*sd)); \ | 7279 | memset(sd, 0, sizeof(*sd)); \ |
7089 | *sd = SD_##type##_INIT; \ | 7280 | *sd = SD_##type##_INIT; \ |
7090 | sd->level = SD_LV_##type; \ | 7281 | sd->level = SD_LV_##type; \ |
7282 | SD_INIT_NAME(sd, type); \ | ||
7091 | } | 7283 | } |
7092 | 7284 | ||
7093 | SD_INIT_FUNC(CPU) | 7285 | SD_INIT_FUNC(CPU) |
@@ -7583,24 +7775,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
7583 | * and partition_sched_domains() will fallback to the single partition | 7775 | * and partition_sched_domains() will fallback to the single partition |
7584 | * 'fallback_doms', it also forces the domains to be rebuilt. | 7776 | * 'fallback_doms', it also forces the domains to be rebuilt. |
7585 | * | 7777 | * |
7778 | * If doms_new==NULL it will be replaced with cpu_online_map. | ||
7779 | * ndoms_new==0 is a special case for destroying existing domains. | ||
7780 | * It will not create the default domain. | ||
7781 | * | ||
7586 | * Call with hotplug lock held | 7782 | * Call with hotplug lock held |
7587 | */ | 7783 | */ |
7588 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | 7784 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
7589 | struct sched_domain_attr *dattr_new) | 7785 | struct sched_domain_attr *dattr_new) |
7590 | { | 7786 | { |
7591 | int i, j; | 7787 | int i, j, n; |
7592 | 7788 | ||
7593 | mutex_lock(&sched_domains_mutex); | 7789 | mutex_lock(&sched_domains_mutex); |
7594 | 7790 | ||
7595 | /* always unregister in case we don't destroy any domains */ | 7791 | /* always unregister in case we don't destroy any domains */ |
7596 | unregister_sched_domain_sysctl(); | 7792 | unregister_sched_domain_sysctl(); |
7597 | 7793 | ||
7598 | if (doms_new == NULL) | 7794 | n = doms_new ? ndoms_new : 0; |
7599 | ndoms_new = 0; | ||
7600 | 7795 | ||
7601 | /* Destroy deleted domains */ | 7796 | /* Destroy deleted domains */ |
7602 | for (i = 0; i < ndoms_cur; i++) { | 7797 | for (i = 0; i < ndoms_cur; i++) { |
7603 | for (j = 0; j < ndoms_new; j++) { | 7798 | for (j = 0; j < n; j++) { |
7604 | if (cpus_equal(doms_cur[i], doms_new[j]) | 7799 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7605 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7800 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
7606 | goto match1; | 7801 | goto match1; |
@@ -7613,7 +7808,6 @@ match1: | |||
7613 | 7808 | ||
7614 | if (doms_new == NULL) { | 7809 | if (doms_new == NULL) { |
7615 | ndoms_cur = 0; | 7810 | ndoms_cur = 0; |
7616 | ndoms_new = 1; | ||
7617 | doms_new = &fallback_doms; | 7811 | doms_new = &fallback_doms; |
7618 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7812 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
7619 | dattr_new = NULL; | 7813 | dattr_new = NULL; |
@@ -7650,8 +7844,13 @@ match2: | |||
7650 | int arch_reinit_sched_domains(void) | 7844 | int arch_reinit_sched_domains(void) |
7651 | { | 7845 | { |
7652 | get_online_cpus(); | 7846 | get_online_cpus(); |
7847 | |||
7848 | /* Destroy domains first to force the rebuild */ | ||
7849 | partition_sched_domains(0, NULL, NULL); | ||
7850 | |||
7653 | rebuild_sched_domains(); | 7851 | rebuild_sched_domains(); |
7654 | put_online_cpus(); | 7852 | put_online_cpus(); |
7853 | |||
7655 | return 0; | 7854 | return 0; |
7656 | } | 7855 | } |
7657 | 7856 | ||
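A short sketch of the revised partition_sched_domains() calling convention documented above, mirroring what arch_reinit_sched_domains() now does: ndoms_new == 0 tears down the current domains without building the fallback domain, and a later call with doms_new == NULL rebuilds the default domain from cpu_online_map. The wrapper function is illustrative and assumes the hotplug lock is taken as shown.

static void reset_sched_domains(void)   /* illustrative helper, not in the patch */
{
        get_online_cpus();
        partition_sched_domains(0, NULL, NULL); /* destroy: no fallback domain is built */
        partition_sched_domains(1, NULL, NULL); /* rebuild default from cpu_online_map */
        put_online_cpus();
}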
@@ -7735,7 +7934,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
7735 | case CPU_ONLINE_FROZEN: | 7934 | case CPU_ONLINE_FROZEN: |
7736 | case CPU_DEAD: | 7935 | case CPU_DEAD: |
7737 | case CPU_DEAD_FROZEN: | 7936 | case CPU_DEAD_FROZEN: |
7738 | partition_sched_domains(0, NULL, NULL); | 7937 | partition_sched_domains(1, NULL, NULL); |
7739 | return NOTIFY_OK; | 7938 | return NOTIFY_OK; |
7740 | 7939 | ||
7741 | default: | 7940 | default: |
@@ -8000,7 +8199,6 @@ void __init sched_init(void) | |||
8000 | 8199 | ||
8001 | rq = cpu_rq(i); | 8200 | rq = cpu_rq(i); |
8002 | spin_lock_init(&rq->lock); | 8201 | spin_lock_init(&rq->lock); |
8003 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
8004 | rq->nr_running = 0; | 8202 | rq->nr_running = 0; |
8005 | init_cfs_rq(&rq->cfs, rq); | 8203 | init_cfs_rq(&rq->cfs, rq); |
8006 | init_rt_rq(&rq->rt, rq); | 8204 | init_rt_rq(&rq->rt, rq); |
@@ -8123,20 +8321,25 @@ void __might_sleep(char *file, int line) | |||
8123 | #ifdef in_atomic | 8321 | #ifdef in_atomic |
8124 | static unsigned long prev_jiffy; /* ratelimiting */ | 8322 | static unsigned long prev_jiffy; /* ratelimiting */ |
8125 | 8323 | ||
8126 | if ((in_atomic() || irqs_disabled()) && | 8324 | if ((!in_atomic() && !irqs_disabled()) || |
8127 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 8325 | system_state != SYSTEM_RUNNING || oops_in_progress) |
8128 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8326 | return; |
8129 | return; | 8327 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
8130 | prev_jiffy = jiffies; | 8328 | return; |
8131 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 8329 | prev_jiffy = jiffies; |
8132 | " context at %s:%d\n", file, line); | 8330 | |
8133 | printk("in_atomic():%d, irqs_disabled():%d\n", | 8331 | printk(KERN_ERR |
8134 | in_atomic(), irqs_disabled()); | 8332 | "BUG: sleeping function called from invalid context at %s:%d\n", |
8135 | debug_show_held_locks(current); | 8333 | file, line); |
8136 | if (irqs_disabled()) | 8334 | printk(KERN_ERR |
8137 | print_irqtrace_events(current); | 8335 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
8138 | dump_stack(); | 8336 | in_atomic(), irqs_disabled(), |
8139 | } | 8337 | current->pid, current->comm); |
8338 | |||
8339 | debug_show_held_locks(current); | ||
8340 | if (irqs_disabled()) | ||
8341 | print_irqtrace_events(current); | ||
8342 | dump_stack(); | ||
8140 | #endif | 8343 | #endif |
8141 | } | 8344 | } |
8142 | EXPORT_SYMBOL(__might_sleep); | 8345 | EXPORT_SYMBOL(__might_sleep); |
@@ -8457,8 +8660,8 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8457 | WARN_ON(!parent); /* root should already exist */ | 8660 | WARN_ON(!parent); /* root should already exist */ |
8458 | 8661 | ||
8459 | tg->parent = parent; | 8662 | tg->parent = parent; |
8460 | list_add_rcu(&tg->siblings, &parent->children); | ||
8461 | INIT_LIST_HEAD(&tg->children); | 8663 | INIT_LIST_HEAD(&tg->children); |
8664 | list_add_rcu(&tg->siblings, &parent->children); | ||
8462 | spin_unlock_irqrestore(&task_group_lock, flags); | 8665 | spin_unlock_irqrestore(&task_group_lock, flags); |
8463 | 8666 | ||
8464 | return tg; | 8667 | return tg; |
@@ -8634,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); | |||
8634 | static unsigned long to_ratio(u64 period, u64 runtime) | 8837 | static unsigned long to_ratio(u64 period, u64 runtime) |
8635 | { | 8838 | { |
8636 | if (runtime == RUNTIME_INF) | 8839 | if (runtime == RUNTIME_INF) |
8637 | return 1ULL << 16; | 8840 | return 1ULL << 20; |
8638 | 8841 | ||
8639 | return div64_u64(runtime << 16, period); | 8842 | return div64_u64(runtime << 20, period); |
8640 | } | 8843 | } |
8641 | 8844 | ||
8642 | #ifdef CONFIG_CGROUP_SCHED | 8845 | /* Must be called with tasklist_lock held */ |
8643 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8846 | static inline int tg_has_rt_tasks(struct task_group *tg) |
8644 | { | 8847 | { |
8645 | struct task_group *tgi, *parent = tg->parent; | 8848 | struct task_struct *g, *p; |
8646 | unsigned long total = 0; | ||
8647 | 8849 | ||
8648 | if (!parent) { | 8850 | do_each_thread(g, p) { |
8649 | if (global_rt_period() < period) | 8851 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) |
8650 | return 0; | 8852 | return 1; |
8853 | } while_each_thread(g, p); | ||
8651 | 8854 | ||
8652 | return to_ratio(period, runtime) < | 8855 | return 0; |
8653 | to_ratio(global_rt_period(), global_rt_runtime()); | 8856 | } |
8654 | } | ||
8655 | 8857 | ||
8656 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | 8858 | struct rt_schedulable_data { |
8657 | return 0; | 8859 | struct task_group *tg; |
8860 | u64 rt_period; | ||
8861 | u64 rt_runtime; | ||
8862 | }; | ||
8658 | 8863 | ||
8659 | rcu_read_lock(); | 8864 | static int tg_schedulable(struct task_group *tg, void *data) |
8660 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | 8865 | { |
8661 | if (tgi == tg) | 8866 | struct rt_schedulable_data *d = data; |
8662 | continue; | 8867 | struct task_group *child; |
8868 | unsigned long total, sum = 0; | ||
8869 | u64 period, runtime; | ||
8870 | |||
8871 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8872 | runtime = tg->rt_bandwidth.rt_runtime; | ||
8663 | 8873 | ||
8664 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8874 | if (tg == d->tg) { |
8665 | tgi->rt_bandwidth.rt_runtime); | 8875 | period = d->rt_period; |
8876 | runtime = d->rt_runtime; | ||
8666 | } | 8877 | } |
8667 | rcu_read_unlock(); | ||
8668 | 8878 | ||
8669 | return total + to_ratio(period, runtime) <= | 8879 | /* |
8670 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8880 | * Cannot have more runtime than the period. |
8671 | parent->rt_bandwidth.rt_runtime); | 8881 | */ |
8672 | } | 8882 | if (runtime > period && runtime != RUNTIME_INF) |
8673 | #elif defined CONFIG_USER_SCHED | 8883 | return -EINVAL; |
8674 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
8675 | { | ||
8676 | struct task_group *tgi; | ||
8677 | unsigned long total = 0; | ||
8678 | unsigned long global_ratio = | ||
8679 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
8680 | 8884 | ||
8681 | rcu_read_lock(); | 8885 | /* |
8682 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8886 | * Ensure we don't starve existing RT tasks. |
8683 | if (tgi == tg) | 8887 | */ |
8684 | continue; | 8888 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) |
8889 | return -EBUSY; | ||
8685 | 8890 | ||
8686 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8891 | total = to_ratio(period, runtime); |
8687 | tgi->rt_bandwidth.rt_runtime); | 8892 | |
8893 | /* | ||
8894 | * Nobody can have more than the global setting allows. | ||
8895 | */ | ||
8896 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
8897 | return -EINVAL; | ||
8898 | |||
8899 | /* | ||
8900 | * The sum of our children's runtime should not exceed our own. | ||
8901 | */ | ||
8902 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
8903 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
8904 | runtime = child->rt_bandwidth.rt_runtime; | ||
8905 | |||
8906 | if (child == d->tg) { | ||
8907 | period = d->rt_period; | ||
8908 | runtime = d->rt_runtime; | ||
8909 | } | ||
8910 | |||
8911 | sum += to_ratio(period, runtime); | ||
8688 | } | 8912 | } |
8689 | rcu_read_unlock(); | ||
8690 | 8913 | ||
8691 | return total + to_ratio(period, runtime) < global_ratio; | 8914 | if (sum > total) |
8915 | return -EINVAL; | ||
8916 | |||
8917 | return 0; | ||
8692 | } | 8918 | } |
8693 | #endif | ||
8694 | 8919 | ||
8695 | /* Must be called with tasklist_lock held */ | 8920 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8696 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
8697 | { | 8921 | { |
8698 | struct task_struct *g, *p; | 8922 | struct rt_schedulable_data data = { |
8699 | do_each_thread(g, p) { | 8923 | .tg = tg, |
8700 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 8924 | .rt_period = period, |
8701 | return 1; | 8925 | .rt_runtime = runtime, |
8702 | } while_each_thread(g, p); | 8926 | }; |
8703 | return 0; | 8927 | |
8928 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | ||
8704 | } | 8929 | } |
8705 | 8930 | ||
8706 | static int tg_set_bandwidth(struct task_group *tg, | 8931 | static int tg_set_bandwidth(struct task_group *tg, |
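A worked example (values are only illustrative defaults) of the 20-bit fixed-point ratio used by tg_schedulable() above: with a 1s period and 950ms of runtime, to_ratio() yields roughly 0.95 in Q20 fixed point, and the sum of the children's ratios must not exceed the parent's.

static unsigned long example_rt_ratio(void)
{
        u64 period  = 1000000000ULL;    /* 1s default rt_period, in ns    */
        u64 runtime =  950000000ULL;    /* 950ms default rt_runtime, in ns */

        /* (950000000 << 20) / 1000000000 == 996147 ~= 0.95 * (1 << 20) */
        return div64_u64(runtime << 20, period);
}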
@@ -8710,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8710 | 8935 | ||
8711 | mutex_lock(&rt_constraints_mutex); | 8936 | mutex_lock(&rt_constraints_mutex); |
8712 | read_lock(&tasklist_lock); | 8937 | read_lock(&tasklist_lock); |
8713 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { | 8938 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
8714 | err = -EBUSY; | 8939 | if (err) |
8715 | goto unlock; | 8940 | goto unlock; |
8716 | } | ||
8717 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { | ||
8718 | err = -EINVAL; | ||
8719 | goto unlock; | ||
8720 | } | ||
8721 | 8941 | ||
8722 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8942 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8723 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8943 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
@@ -8786,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg) | |||
8786 | 9006 | ||
8787 | static int sched_rt_global_constraints(void) | 9007 | static int sched_rt_global_constraints(void) |
8788 | { | 9008 | { |
8789 | struct task_group *tg = &root_task_group; | 9009 | u64 runtime, period; |
8790 | u64 rt_runtime, rt_period; | ||
8791 | int ret = 0; | 9010 | int ret = 0; |
8792 | 9011 | ||
8793 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | 9012 | if (sysctl_sched_rt_period <= 0) |
8794 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 9013 | return -EINVAL; |
9014 | |||
9015 | runtime = global_rt_runtime(); | ||
9016 | period = global_rt_period(); | ||
9017 | |||
9018 | /* | ||
9019 | * Sanity check on the sysctl variables. | ||
9020 | */ | ||
9021 | if (runtime > period && runtime != RUNTIME_INF) | ||
9022 | return -EINVAL; | ||
8795 | 9023 | ||
8796 | mutex_lock(&rt_constraints_mutex); | 9024 | mutex_lock(&rt_constraints_mutex); |
8797 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) | 9025 | read_lock(&tasklist_lock); |
8798 | ret = -EINVAL; | 9026 | ret = __rt_schedulable(NULL, 0, 0); |
9027 | read_unlock(&tasklist_lock); | ||
8799 | mutex_unlock(&rt_constraints_mutex); | 9028 | mutex_unlock(&rt_constraints_mutex); |
8800 | 9029 | ||
8801 | return ret; | 9030 | return ret; |
@@ -8806,6 +9035,9 @@ static int sched_rt_global_constraints(void) | |||
8806 | unsigned long flags; | 9035 | unsigned long flags; |
8807 | int i; | 9036 | int i; |
8808 | 9037 | ||
9038 | if (sysctl_sched_rt_period <= 0) | ||
9039 | return -EINVAL; | ||
9040 | |||
8809 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 9041 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
8810 | for_each_possible_cpu(i) { | 9042 | for_each_possible_cpu(i) { |
8811 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 9043 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
@@ -8866,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8866 | 9098 | ||
8867 | if (!cgrp->parent) { | 9099 | if (!cgrp->parent) { |
8868 | /* This is early initialization for the top cgroup */ | 9100 | /* This is early initialization for the top cgroup */ |
8869 | init_task_group.css.cgroup = cgrp; | ||
8870 | return &init_task_group.css; | 9101 | return &init_task_group.css; |
8871 | } | 9102 | } |
8872 | 9103 | ||
@@ -8875,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8875 | if (IS_ERR(tg)) | 9106 | if (IS_ERR(tg)) |
8876 | return ERR_PTR(-ENOMEM); | 9107 | return ERR_PTR(-ENOMEM); |
8877 | 9108 | ||
8878 | /* Bind the cgroup to task_group object we just created */ | ||
8879 | tg->css.cgroup = cgrp; | ||
8880 | |||
8881 | return &tg->css; | 9109 | return &tg->css; |
8882 | } | 9110 | } |
8883 | 9111 | ||