Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 317 |
1 file changed, 227 insertions, 90 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..8fe7ee81c552 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@ | |||
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Targeted preemption latency for CPU-bound tasks: | 27 | * Targeted preemption latency for CPU-bound tasks: |
@@ -35,12 +36,26 @@ | |||
35 | * run vmstat and monitor the context-switches (cs) field) | 36 | * run vmstat and monitor the context-switches (cs) field) |
36 | */ | 37 | */ |
37 | unsigned int sysctl_sched_latency = 5000000ULL; | 38 | unsigned int sysctl_sched_latency = 5000000ULL; |
39 | unsigned int normalized_sysctl_sched_latency = 5000000ULL; | ||
40 | |||
41 | /* | ||
42 | * The initial- and re-scaling of tunables is configurable | ||
43 | * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))) | ||
44 | * | ||
45 | * Options are: | ||
46 | * SCHED_TUNABLESCALING_NONE - unscaled, always *1 | ||
47 | * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) | ||
48 | * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus | ||
49 | */ | ||
50 | enum sched_tunable_scaling sysctl_sched_tunable_scaling | ||
51 | = SCHED_TUNABLESCALING_LOG; | ||
38 | 52 | ||
39 | /* | 53 | /* |
40 | * Minimal preemption granularity for CPU-bound tasks: | 54 | * Minimal preemption granularity for CPU-bound tasks: |
41 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 55 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
42 | */ | 56 | */ |
43 | unsigned int sysctl_sched_min_granularity = 1000000ULL; | 57 | unsigned int sysctl_sched_min_granularity = 1000000ULL; |
58 | unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; | ||
44 | 59 | ||
45 | /* | 60 | /* |
46 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
70 | * have immediate wakeup/sleep latencies. | 85 | * have immediate wakeup/sleep latencies. |
71 | */ | 86 | */ |
72 | unsigned int sysctl_sched_wakeup_granularity = 1000000UL; | 87 | unsigned int sysctl_sched_wakeup_granularity = 1000000UL; |
88 | unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | ||
73 | 89 | ||
74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
75 | 91 | ||
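The normalized_sysctl_* variables added above hold the single-CPU baselines for the latency, minimum-granularity and wakeup-granularity tunables; the effective sysctl values are those baselines multiplied by a CPU-count factor chosen by sysctl_sched_tunable_scaling. A standalone sketch of that arithmetic, taking ilog() in the comment to mean the integer base-2 logarithm (an illustration, not the kernel's helper):

#include <stdio.h>

/* Standalone illustration of the tunable scaling described above.
 * ilog() is read as the integer base-2 logarithm, per the
 * "*(1+ilog(ncpus))" comment; this is not the kernel's code. */
static unsigned int ilog2_uint(unsigned int x)
{
	unsigned int r = 0;
	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned int normalized_latency = 5000000;	/* 5 ms default */
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 64; ncpus *= 4) {
		unsigned int factor = 1 + ilog2_uint(ncpus);	/* TUNABLESCALING_LOG */
		printf("%2u cpus: factor %u -> sched_latency %u ns\n",
		       ncpus, factor, normalized_latency * factor);
	}
	return 0;
}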
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
383 | */ | 399 | */ |
384 | 400 | ||
385 | #ifdef CONFIG_SCHED_DEBUG | 401 | #ifdef CONFIG_SCHED_DEBUG |
386 | int sched_nr_latency_handler(struct ctl_table *table, int write, | 402 | int sched_proc_update_handler(struct ctl_table *table, int write, |
387 | void __user *buffer, size_t *lenp, | 403 | void __user *buffer, size_t *lenp, |
388 | loff_t *ppos) | 404 | loff_t *ppos) |
389 | { | 405 | { |
390 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 406 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
407 | int factor = get_update_sysctl_factor(); | ||
391 | 408 | ||
392 | if (ret || !write) | 409 | if (ret || !write) |
393 | return ret; | 410 | return ret; |
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
395 | sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, | 412 | sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, |
396 | sysctl_sched_min_granularity); | 413 | sysctl_sched_min_granularity); |
397 | 414 | ||
415 | #define WRT_SYSCTL(name) \ | ||
416 | (normalized_sysctl_##name = sysctl_##name / (factor)) | ||
417 | WRT_SYSCTL(sched_min_granularity); | ||
418 | WRT_SYSCTL(sched_latency); | ||
419 | WRT_SYSCTL(sched_wakeup_granularity); | ||
420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
421 | #undef WRT_SYSCTL | ||
422 | |||
398 | return 0; | 423 | return 0; |
399 | } | 424 | } |
400 | #endif | 425 | #endif |
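The WRT_SYSCTL() block keeps those normalized copies in sync when one of the sysctls is written: the value the administrator supplied is divided by the current factor, so a later rescale (for instance after CPU hotplug, via the rq_online/rq_offline hooks added further down) starts again from the single-CPU baseline. A minimal userspace sketch of that round trip; factor_for(), write_sysctl() and rescale() are invented names standing in for get_update_sysctl_factor() and the kernel's update paths:

#include <stdio.h>

/* Hypothetical sketch of the normalize-on-write / rescale-on-hotplug
 * round trip implied by WRT_SYSCTL() and the rq_online/rq_offline
 * hooks; names here are illustrative, not the kernel's. */
static unsigned int factor_for(unsigned int ncpus)
{
	unsigned int f = 1;
	while (ncpus >>= 1)
		f++;			/* 1 + ilog2(ncpus) */
	return f;
}

static unsigned int normalized_latency;	/* single-CPU baseline */
static unsigned int sysctl_latency;	/* effective, scaled value */

static void write_sysctl(unsigned int val, unsigned int ncpus)
{
	sysctl_latency = val;
	normalized_latency = val / factor_for(ncpus);	/* WRT_SYSCTL() step */
}

static void rescale(unsigned int ncpus)
{
	sysctl_latency = normalized_latency * factor_for(ncpus);
}

int main(void)
{
	write_sysctl(15000000, 4);	/* admin writes 15 ms on a 4-CPU box */
	rescale(16);			/* more CPUs online, factor grows    */
	printf("normalized %u ns, rescaled %u ns\n",
	       normalized_latency, sysctl_latency);
	return 0;
}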
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
485 | curr->sum_exec_runtime += delta_exec; | 510 | curr->sum_exec_runtime += delta_exec; |
486 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 511 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
487 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); | 512 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
513 | |||
488 | curr->vruntime += delta_exec_weighted; | 514 | curr->vruntime += delta_exec_weighted; |
489 | update_min_vruntime(cfs_rq); | 515 | update_min_vruntime(cfs_rq); |
490 | } | 516 | } |
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
740 | se->vruntime = vruntime; | 766 | se->vruntime = vruntime; |
741 | } | 767 | } |
742 | 768 | ||
769 | #define ENQUEUE_WAKEUP 1 | ||
770 | #define ENQUEUE_MIGRATE 2 | ||
771 | |||
743 | static void | 772 | static void |
744 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | 773 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
745 | { | 774 | { |
746 | /* | 775 | /* |
776 | * Update the normalized vruntime before updating min_vruntime | ||
777 | * through calling update_curr(). | ||
778 | */ | ||
779 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) | ||
780 | se->vruntime += cfs_rq->min_vruntime; | ||
781 | |||
782 | /* | ||
747 | * Update run-time statistics of the 'current'. | 783 | * Update run-time statistics of the 'current'. |
748 | */ | 784 | */ |
749 | update_curr(cfs_rq); | 785 | update_curr(cfs_rq); |
750 | account_entity_enqueue(cfs_rq, se); | 786 | account_entity_enqueue(cfs_rq, se); |
751 | 787 | ||
752 | if (wakeup) { | 788 | if (flags & ENQUEUE_WAKEUP) { |
753 | place_entity(cfs_rq, se, 0); | 789 | place_entity(cfs_rq, se, 0); |
754 | enqueue_sleeper(cfs_rq, se); | 790 | enqueue_sleeper(cfs_rq, se); |
755 | } | 791 | } |
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
803 | __dequeue_entity(cfs_rq, se); | 839 | __dequeue_entity(cfs_rq, se); |
804 | account_entity_dequeue(cfs_rq, se); | 840 | account_entity_dequeue(cfs_rq, se); |
805 | update_min_vruntime(cfs_rq); | 841 | update_min_vruntime(cfs_rq); |
842 | |||
843 | /* | ||
844 | * Normalize the entity after updating the min_vruntime because the | ||
845 | * update can refer to the ->curr item and we need to reflect this | ||
846 | * movement in our normalized position. | ||
847 | */ | ||
848 | if (!sleep) | ||
849 | se->vruntime -= cfs_rq->min_vruntime; | ||
806 | } | 850 | } |
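Together with the ENQUEUE_MIGRATE handling in enqueue_entity() above, this dequeue-side subtraction switches migration to relative virtual time: an entity leaving a runqueue keeps only its offset from that queue's min_vruntime, and the destination queue's min_vruntime is added back on enqueue, so differences between per-CPU virtual clocks do not become an unfair bonus or penalty. A toy model (plain C, not kernel code) showing that the subtract/re-add pair preserves the entity's lag:

#include <stdio.h>

/* Toy model: two runqueues whose virtual clocks (min_vruntime) differ
 * wildly.  Migrating by "subtract old base, add new base" keeps the
 * task's lag relative to its queue unchanged.  Not kernel code. */
struct toy_rq { unsigned long long min_vruntime; };
struct toy_se { unsigned long long vruntime; };

static void toy_dequeue(struct toy_se *se, struct toy_rq *rq)
{
	se->vruntime -= rq->min_vruntime;	/* normalize (migration case) */
}

static void toy_enqueue(struct toy_se *se, struct toy_rq *rq)
{
	se->vruntime += rq->min_vruntime;	/* re-base on the new queue */
}

int main(void)
{
	struct toy_rq src = { .min_vruntime = 1000000000ULL };
	struct toy_rq dst = { .min_vruntime =    5000000ULL };
	struct toy_se se  = { .vruntime     = 1000300000ULL };	/* 300 us past src's base */

	printf("lag on src: %llu ns\n", se.vruntime - src.min_vruntime);
	toy_dequeue(&se, &src);
	toy_enqueue(&se, &dst);
	printf("lag on dst: %llu ns\n", se.vruntime - dst.min_vruntime);
	return 0;
}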
807 | 851 | ||
808 | /* | 852 | /* |
@@ -822,6 +866,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
822 | * re-elected due to buddy favours. | 866 | * re-elected due to buddy favours. |
823 | */ | 867 | */ |
824 | clear_buddies(cfs_rq, curr); | 868 | clear_buddies(cfs_rq, curr); |
869 | return; | ||
870 | } | ||
871 | |||
872 | /* | ||
873 | * Ensure that a task that missed wakeup preemption by a | ||
874 | * narrow margin doesn't have to wait for a full slice. | ||
875 | * This also mitigates buddy induced latencies under load. | ||
876 | */ | ||
877 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
878 | return; | ||
879 | |||
880 | if (delta_exec < sysctl_sched_min_granularity) | ||
881 | return; | ||
882 | |||
883 | if (cfs_rq->nr_running > 1) { | ||
884 | struct sched_entity *se = __pick_next_entity(cfs_rq); | ||
885 | s64 delta = curr->vruntime - se->vruntime; | ||
886 | |||
887 | if (delta > ideal_runtime) | ||
888 | resched_task(rq_of(cfs_rq)->curr); | ||
825 | } | 889 | } |
826 | } | 890 | } |
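The new tail of check_preempt_tick() reschedules early when the running task's vruntime already leads the leftmost queued entity by more than its ideal slice, so a task that lost wakeup preemption by a narrow margin is not made to wait out a full slice. A compact sketch of just that comparison, with illustrative names (ideal_runtime would come from sched_slice() in the real function):

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the "missed wakeup preemption" check added to
 * check_preempt_tick() above; names are illustrative, not kernel code. */
struct entity { long long vruntime; };

static bool tick_should_resched(const struct entity *curr,
				const struct entity *leftmost,
				long long delta_exec,
				long long ideal_runtime,
				long long min_granularity)
{
	if (delta_exec < min_granularity)	/* ran too briefly to bother */
		return false;
	/* curr leads the leftmost task by more than one ideal slice */
	return curr->vruntime - leftmost->vruntime > ideal_runtime;
}

int main(void)
{
	struct entity curr = { .vruntime = 12000000 };
	struct entity next = { .vruntime =  4000000 };

	printf("resched: %d\n",
	       tick_should_resched(&curr, &next, 2000000, 6000000, 1000000));
	return 0;
}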
827 | 891 | ||
@@ -861,12 +925,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | |||
861 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 925 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
862 | { | 926 | { |
863 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 927 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
928 | struct sched_entity *left = se; | ||
864 | 929 | ||
865 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) | 930 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) |
866 | return cfs_rq->next; | 931 | se = cfs_rq->next; |
867 | 932 | ||
868 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) | 933 | /* |
869 | return cfs_rq->last; | 934 | * Prefer last buddy, try to return the CPU to a preempted task. |
935 | */ | ||
936 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | ||
937 | se = cfs_rq->last; | ||
938 | |||
939 | clear_buddies(cfs_rq, se); | ||
870 | 940 | ||
871 | return se; | 941 | return se; |
872 | } | 942 | } |
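pick_next_entity() now only lets a buddy displace the leftmost entity when wakeup_preempt_entity() says the buddy is not lagging too far behind, and it clears the chosen entity's buddy status so the favour has to be earned again. wakeup_preempt_entity() itself is not shown in this hunk; the toy below models its contract as understood here (return 1 when the second argument leads by more than the wakeup granularity, 0 when it leads by less, -1 when it does not lead at all; the real function also weight-scales the granularity):

#include <stdio.h>

/* Toy model of the wakeup_preempt_entity() contract relied on above;
 * the real function additionally weight-scales the granularity. */
struct ent { long long vruntime; };

static int toy_wakeup_preempt_entity(const struct ent *curr,
				     const struct ent *se,
				     long long wakeup_gran)
{
	long long vdiff = curr->vruntime - se->vruntime;

	if (vdiff <= 0)
		return -1;
	if (vdiff > wakeup_gran)
		return 1;
	return 0;
}

int main(void)
{
	struct ent leftmost = { .vruntime = 1000000 };
	struct ent buddy    = { .vruntime = 1400000 };	/* 0.4 ms behind */

	/* pick_next_entity() keeps the buddy when this is < 1, i.e. the
	 * buddy does not trail the leftmost task by more than one
	 * wakeup granularity. */
	int r = toy_wakeup_preempt_entity(&buddy, &leftmost, 1000000);

	printf("wakeup_preempt_entity(buddy, leftmost) = %d -> %s\n",
	       r, r < 1 ? "run the buddy" : "run the leftmost");
	return 0;
}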
@@ -987,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
987 | { | 1057 | { |
988 | struct cfs_rq *cfs_rq; | 1058 | struct cfs_rq *cfs_rq; |
989 | struct sched_entity *se = &p->se; | 1059 | struct sched_entity *se = &p->se; |
1060 | int flags = 0; | ||
1061 | |||
1062 | if (wakeup) | ||
1063 | flags |= ENQUEUE_WAKEUP; | ||
1064 | if (p->state == TASK_WAKING) | ||
1065 | flags |= ENQUEUE_MIGRATE; | ||
990 | 1066 | ||
991 | for_each_sched_entity(se) { | 1067 | for_each_sched_entity(se) { |
992 | if (se->on_rq) | 1068 | if (se->on_rq) |
993 | break; | 1069 | break; |
994 | cfs_rq = cfs_rq_of(se); | 1070 | cfs_rq = cfs_rq_of(se); |
995 | enqueue_entity(cfs_rq, se, wakeup); | 1071 | enqueue_entity(cfs_rq, se, flags); |
996 | wakeup = 1; | 1072 | flags = ENQUEUE_WAKEUP; |
997 | } | 1073 | } |
998 | 1074 | ||
999 | hrtick_update(rq); | 1075 | hrtick_update(rq); |
@@ -1069,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) | |||
1069 | 1145 | ||
1070 | #ifdef CONFIG_SMP | 1146 | #ifdef CONFIG_SMP |
1071 | 1147 | ||
1148 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | ||
1149 | { | ||
1150 | struct sched_entity *se = &p->se; | ||
1151 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1152 | |||
1153 | se->vruntime -= cfs_rq->min_vruntime; | ||
1154 | } | ||
1155 | |||
1072 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1156 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1073 | /* | 1157 | /* |
1074 | * effective_load() calculates the load change as seen from the root_task_group | 1158 | * effective_load() calculates the load change as seen from the root_task_group |
@@ -1319,6 +1403,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1319 | } | 1403 | } |
1320 | 1404 | ||
1321 | /* | 1405 | /* |
1406 | * Try and locate an idle CPU in the sched_domain. | ||
1407 | */ | ||
1408 | static int | ||
1409 | select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) | ||
1410 | { | ||
1411 | int cpu = smp_processor_id(); | ||
1412 | int prev_cpu = task_cpu(p); | ||
1413 | int i; | ||
1414 | |||
1415 | /* | ||
1416 | * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE | ||
1417 | * test in select_task_rq_fair) and the prev_cpu is idle then that's | ||
1418 | * always a better target than the current cpu. | ||
1419 | */ | ||
1420 | if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) | ||
1421 | return prev_cpu; | ||
1422 | |||
1423 | /* | ||
1424 | * Otherwise, iterate the domain and find an eligible idle cpu. | ||
1425 | */ | ||
1426 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | ||
1427 | if (!cpu_rq(i)->cfs.nr_running) { | ||
1428 | target = i; | ||
1429 | break; | ||
1430 | } | ||
1431 | } | ||
1432 | |||
1433 | return target; | ||
1434 | } | ||
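select_idle_sibling() simply walks the CPUs that are both in the domain span and in the task's affinity mask and takes the first one whose CFS runqueue is empty, falling back to the previously chosen target. A toy model of that scan, using plain bitmasks in place of cpumasks and an array in place of cpu_rq(i)->cfs.nr_running:

#include <stdio.h>

/* Toy model of the select_idle_sibling() scan above; bitmasks stand in
 * for cpumasks and nr_running[] for cpu_rq(i)->cfs.nr_running. */
int main(void)
{
	unsigned int domain_span  = 0x0f;	/* CPUs 0-3 share a package  */
	unsigned int cpus_allowed = 0x0e;	/* task may not run on CPU 0 */
	unsigned int nr_running[4] = { 0, 3, 0, 1 };
	int target = 1;				/* wake_affine target so far */
	int i;

	for (i = 0; i < 4; i++) {
		if (!((domain_span & cpus_allowed) & (1u << i)))
			continue;
		if (nr_running[i] == 0) {	/* idle cfs runqueue */
			target = i;
			break;
		}
	}
	printf("selected CPU %d\n", target);	/* CPU 2: first allowed idle */
	return 0;
}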
1435 | |||
1436 | /* | ||
1322 | * sched_balance_self: balance the current task (running on cpu) in domains | 1437 | * sched_balance_self: balance the current task (running on cpu) in domains |
1323 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | 1438 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and |
1324 | * SD_BALANCE_EXEC. | 1439 | * SD_BALANCE_EXEC. |
@@ -1346,8 +1461,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1346 | new_cpu = prev_cpu; | 1461 | new_cpu = prev_cpu; |
1347 | } | 1462 | } |
1348 | 1463 | ||
1349 | rcu_read_lock(); | ||
1350 | for_each_domain(cpu, tmp) { | 1464 | for_each_domain(cpu, tmp) { |
1465 | if (!(tmp->flags & SD_LOAD_BALANCE)) | ||
1466 | continue; | ||
1467 | |||
1351 | /* | 1468 | /* |
1352 | * If power savings logic is enabled for a domain, see if we | 1469 | * If power savings logic is enabled for a domain, see if we |
1353 | * are not overloaded, if so, don't balance wider. | 1470 | * are not overloaded, if so, don't balance wider. |
@@ -1372,11 +1489,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1372 | want_sd = 0; | 1489 | want_sd = 0; |
1373 | } | 1490 | } |
1374 | 1491 | ||
1375 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 1492 | /* |
1376 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | 1493 | * While iterating the domains looking for a spanning |
1494 | * WAKE_AFFINE domain, adjust the affine target to any idle cpu | ||
1495 | * in cache sharing domains along the way. | ||
1496 | */ | ||
1497 | if (want_affine) { | ||
1498 | int target = -1; | ||
1377 | 1499 | ||
1378 | affine_sd = tmp; | 1500 | /* |
1379 | want_affine = 0; | 1501 | * If both cpu and prev_cpu are part of this domain, |
1502 | * cpu is a valid SD_WAKE_AFFINE target. | ||
1503 | */ | ||
1504 | if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) | ||
1505 | target = cpu; | ||
1506 | |||
1507 | /* | ||
1508 | * If there's an idle sibling in this domain, make that | ||
1509 | * the wake_affine target instead of the current cpu. | ||
1510 | */ | ||
1511 | if (tmp->flags & SD_SHARE_PKG_RESOURCES) | ||
1512 | target = select_idle_sibling(p, tmp, target); | ||
1513 | |||
1514 | if (target >= 0) { | ||
1515 | if (tmp->flags & SD_WAKE_AFFINE) { | ||
1516 | affine_sd = tmp; | ||
1517 | want_affine = 0; | ||
1518 | } | ||
1519 | cpu = target; | ||
1520 | } | ||
1380 | } | 1521 | } |
1381 | 1522 | ||
1382 | if (!want_sd && !want_affine) | 1523 | if (!want_sd && !want_affine) |
@@ -1403,10 +1544,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1403 | update_shares(tmp); | 1544 | update_shares(tmp); |
1404 | } | 1545 | } |
1405 | 1546 | ||
1406 | if (affine_sd && wake_affine(affine_sd, p, sync)) { | 1547 | if (affine_sd && wake_affine(affine_sd, p, sync)) |
1407 | new_cpu = cpu; | 1548 | return cpu; |
1408 | goto out; | ||
1409 | } | ||
1410 | 1549 | ||
1411 | while (sd) { | 1550 | while (sd) { |
1412 | int load_idx = sd->forkexec_idx; | 1551 | int load_idx = sd->forkexec_idx; |
@@ -1447,8 +1586,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1447 | /* while loop will break here if sd == NULL */ | 1586 | /* while loop will break here if sd == NULL */ |
1448 | } | 1587 | } |
1449 | 1588 | ||
1450 | out: | ||
1451 | rcu_read_unlock(); | ||
1452 | return new_cpu; | 1589 | return new_cpu; |
1453 | } | 1590 | } |
1454 | #endif /* CONFIG_SMP */ | 1591 | #endif /* CONFIG_SMP */ |
@@ -1568,13 +1705,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1568 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1705 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1569 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1706 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1570 | int sync = wake_flags & WF_SYNC; | 1707 | int sync = wake_flags & WF_SYNC; |
1708 | int scale = cfs_rq->nr_running >= sched_nr_latency; | ||
1571 | 1709 | ||
1572 | update_curr(cfs_rq); | 1710 | if (unlikely(rt_prio(p->prio))) |
1573 | 1711 | goto preempt; | |
1574 | if (unlikely(rt_prio(p->prio))) { | ||
1575 | resched_task(curr); | ||
1576 | return; | ||
1577 | } | ||
1578 | 1712 | ||
1579 | if (unlikely(p->sched_class != &fair_sched_class)) | 1713 | if (unlikely(p->sched_class != &fair_sched_class)) |
1580 | return; | 1714 | return; |
@@ -1582,18 +1716,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1582 | if (unlikely(se == pse)) | 1716 | if (unlikely(se == pse)) |
1583 | return; | 1717 | return; |
1584 | 1718 | ||
1585 | /* | 1719 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) |
1586 | * Only set the backward buddy when the current task is still on the | ||
1587 | * rq. This can happen when a wakeup gets interleaved with schedule on | ||
1588 | * the ->pre_schedule() or idle_balance() point, either of which can | ||
1589 | * drop the rq lock. | ||
1590 | * | ||
1591 | * Also, during early boot the idle thread is in the fair class, for | ||
1592 | * obvious reasons its a bad idea to schedule back to the idle thread. | ||
1593 | */ | ||
1594 | if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) | ||
1595 | set_last_buddy(se); | ||
1596 | if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) | ||
1597 | set_next_buddy(pse); | 1720 | set_next_buddy(pse); |
1598 | 1721 | ||
1599 | /* | 1722 | /* |
@@ -1611,36 +1734,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1611 | return; | 1734 | return; |
1612 | 1735 | ||
1613 | /* Idle tasks are by definition preempted by everybody. */ | 1736 | /* Idle tasks are by definition preempted by everybody. */ |
1614 | if (unlikely(curr->policy == SCHED_IDLE)) { | 1737 | if (unlikely(curr->policy == SCHED_IDLE)) |
1615 | resched_task(curr); | 1738 | goto preempt; |
1616 | return; | ||
1617 | } | ||
1618 | 1739 | ||
1619 | if ((sched_feat(WAKEUP_SYNC) && sync) || | 1740 | if (sched_feat(WAKEUP_SYNC) && sync) |
1620 | (sched_feat(WAKEUP_OVERLAP) && | 1741 | goto preempt; |
1621 | (se->avg_overlap < sysctl_sched_migration_cost && | ||
1622 | pse->avg_overlap < sysctl_sched_migration_cost))) { | ||
1623 | resched_task(curr); | ||
1624 | return; | ||
1625 | } | ||
1626 | 1742 | ||
1627 | if (sched_feat(WAKEUP_RUNNING)) { | 1743 | if (sched_feat(WAKEUP_OVERLAP) && |
1628 | if (pse->avg_running < se->avg_running) { | 1744 | se->avg_overlap < sysctl_sched_migration_cost && |
1629 | set_next_buddy(pse); | 1745 | pse->avg_overlap < sysctl_sched_migration_cost) |
1630 | resched_task(curr); | 1746 | goto preempt; |
1631 | return; | ||
1632 | } | ||
1633 | } | ||
1634 | 1747 | ||
1635 | if (!sched_feat(WAKEUP_PREEMPT)) | 1748 | if (!sched_feat(WAKEUP_PREEMPT)) |
1636 | return; | 1749 | return; |
1637 | 1750 | ||
1751 | update_curr(cfs_rq); | ||
1638 | find_matching_se(&se, &pse); | 1752 | find_matching_se(&se, &pse); |
1639 | |||
1640 | BUG_ON(!pse); | 1753 | BUG_ON(!pse); |
1641 | |||
1642 | if (wakeup_preempt_entity(se, pse) == 1) | 1754 | if (wakeup_preempt_entity(se, pse) == 1) |
1643 | resched_task(curr); | 1755 | goto preempt; |
1756 | |||
1757 | return; | ||
1758 | |||
1759 | preempt: | ||
1760 | resched_task(curr); | ||
1761 | /* | ||
1762 | * Only set the backward buddy when the current task is still | ||
1763 | * on the rq. This can happen when a wakeup gets interleaved | ||
1764 | * with schedule on the ->pre_schedule() or idle_balance() | ||
1765 | * point, either of which can drop the rq lock. | ||
1766 | * | ||
1767 | * Also, during early boot the idle thread is in the fair class, | ||
1768 | * for obvious reasons it's a bad idea to schedule back to it. | ||
1769 | */ | ||
1770 | if (unlikely(!se->on_rq || curr == rq->idle)) | ||
1771 | return; | ||
1772 | |||
1773 | if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) | ||
1774 | set_last_buddy(se); | ||
1644 | } | 1775 | } |
1645 | 1776 | ||
1646 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1777 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
@@ -1649,21 +1780,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
1649 | struct cfs_rq *cfs_rq = &rq->cfs; | 1780 | struct cfs_rq *cfs_rq = &rq->cfs; |
1650 | struct sched_entity *se; | 1781 | struct sched_entity *se; |
1651 | 1782 | ||
1652 | if (unlikely(!cfs_rq->nr_running)) | 1783 | if (!cfs_rq->nr_running) |
1653 | return NULL; | 1784 | return NULL; |
1654 | 1785 | ||
1655 | do { | 1786 | do { |
1656 | se = pick_next_entity(cfs_rq); | 1787 | se = pick_next_entity(cfs_rq); |
1657 | /* | ||
1658 | * If se was a buddy, clear it so that it will have to earn | ||
1659 | * the favour again. | ||
1660 | * | ||
1661 | * If se was not a buddy, clear the buddies because neither | ||
1662 | * was elegible to run, let them earn it again. | ||
1663 | * | ||
1664 | * IOW. unconditionally clear buddies. | ||
1665 | */ | ||
1666 | __clear_buddies(cfs_rq, NULL); | ||
1667 | set_next_entity(cfs_rq, se); | 1788 | set_next_entity(cfs_rq, se); |
1668 | cfs_rq = group_cfs_rq(se); | 1789 | cfs_rq = group_cfs_rq(se); |
1669 | } while (cfs_rq); | 1790 | } while (cfs_rq); |
@@ -1830,6 +1951,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1830 | 1951 | ||
1831 | return 0; | 1952 | return 0; |
1832 | } | 1953 | } |
1954 | |||
1955 | static void rq_online_fair(struct rq *rq) | ||
1956 | { | ||
1957 | update_sysctl(); | ||
1958 | } | ||
1959 | |||
1960 | static void rq_offline_fair(struct rq *rq) | ||
1961 | { | ||
1962 | update_sysctl(); | ||
1963 | } | ||
1964 | |||
1833 | #endif /* CONFIG_SMP */ | 1965 | #endif /* CONFIG_SMP */ |
1834 | 1966 | ||
1835 | /* | 1967 | /* |
@@ -1847,28 +1979,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
1847 | } | 1979 | } |
1848 | 1980 | ||
1849 | /* | 1981 | /* |
1850 | * Share the fairness runtime between parent and child, thus the | 1982 | * called on fork with the child task as argument from the parent's context |
1851 | * total amount of pressure for CPU stays equal - new tasks | 1983 | * - child not yet on the tasklist |
1852 | * get a chance to run but frequent forkers are not allowed to | 1984 | * - preemption disabled |
1853 | * monopolize the CPU. Note: the parent runqueue is locked, | ||
1854 | * the child is not running yet. | ||
1855 | */ | 1985 | */ |
1856 | static void task_new_fair(struct rq *rq, struct task_struct *p) | 1986 | static void task_fork_fair(struct task_struct *p) |
1857 | { | 1987 | { |
1858 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1988 | struct cfs_rq *cfs_rq = task_cfs_rq(current); |
1859 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; | 1989 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; |
1860 | int this_cpu = smp_processor_id(); | 1990 | int this_cpu = smp_processor_id(); |
1991 | struct rq *rq = this_rq(); | ||
1992 | unsigned long flags; | ||
1993 | |||
1994 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1861 | 1995 | ||
1862 | sched_info_queued(p); | 1996 | if (unlikely(task_cpu(p) != this_cpu)) |
1997 | __set_task_cpu(p, this_cpu); | ||
1863 | 1998 | ||
1864 | update_curr(cfs_rq); | 1999 | update_curr(cfs_rq); |
2000 | |||
1865 | if (curr) | 2001 | if (curr) |
1866 | se->vruntime = curr->vruntime; | 2002 | se->vruntime = curr->vruntime; |
1867 | place_entity(cfs_rq, se, 1); | 2003 | place_entity(cfs_rq, se, 1); |
1868 | 2004 | ||
1869 | /* 'curr' will be NULL if the child belongs to a different group */ | 2005 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { |
1870 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && | ||
1871 | curr && entity_before(curr, se)) { | ||
1872 | /* | 2006 | /* |
1873 | * Upon rescheduling, sched_class::put_prev_task() will place | 2007 | * Upon rescheduling, sched_class::put_prev_task() will place |
1874 | * 'current' within the tree based on its new key value. | 2008 | * 'current' within the tree based on its new key value. |
@@ -1877,7 +2011,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1877 | resched_task(rq->curr); | 2011 | resched_task(rq->curr); |
1878 | } | 2012 | } |
1879 | 2013 | ||
1880 | enqueue_task_fair(rq, p, 0); | 2014 | se->vruntime -= cfs_rq->min_vruntime; |
2015 | |||
2016 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1881 | } | 2017 | } |
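task_fork_fair() seeds the child at the parent's vruntime, places it, optionally reschedules so the child runs first, and finally subtracts min_vruntime so that the later enqueue (possibly on a different CPU) re-bases the child exactly as in the migration path above. The entity_before() test it relies on is, roughly, a wrap-safe signed comparison of vruntimes; a toy version, with invented names and assuming the usual signed-difference trick:

#include <stdint.h>
#include <stdio.h>

/* Rough, non-kernel model of the entity_before() test used above:
 * vruntimes are compared through a signed difference so the ordering
 * survives wraparound of the u64 virtual clock. */
static int toy_entity_before(uint64_t a_vruntime, uint64_t b_vruntime)
{
	return (int64_t)(a_vruntime - b_vruntime) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 100;	/* clock about to wrap  */
	uint64_t wrapped   = 50;		/* clock already wrapped */

	/* A plain "<" would get this backwards after the wrap. */
	printf("near_wrap before wrapped: %d\n",
	       toy_entity_before(near_wrap, wrapped));
	return 0;
}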
1882 | 2018 | ||
1883 | /* | 2019 | /* |
@@ -1930,30 +2066,27 @@ static void set_curr_task_fair(struct rq *rq) | |||
1930 | } | 2066 | } |
1931 | 2067 | ||
1932 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2068 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1933 | static void moved_group_fair(struct task_struct *p) | 2069 | static void moved_group_fair(struct task_struct *p, int on_rq) |
1934 | { | 2070 | { |
1935 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 2071 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
1936 | 2072 | ||
1937 | update_curr(cfs_rq); | 2073 | update_curr(cfs_rq); |
1938 | place_entity(cfs_rq, &p->se, 1); | 2074 | if (!on_rq) |
2075 | place_entity(cfs_rq, &p->se, 1); | ||
1939 | } | 2076 | } |
1940 | #endif | 2077 | #endif |
1941 | 2078 | ||
1942 | unsigned int get_rr_interval_fair(struct task_struct *task) | 2079 | unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
1943 | { | 2080 | { |
1944 | struct sched_entity *se = &task->se; | 2081 | struct sched_entity *se = &task->se; |
1945 | unsigned long flags; | ||
1946 | struct rq *rq; | ||
1947 | unsigned int rr_interval = 0; | 2082 | unsigned int rr_interval = 0; |
1948 | 2083 | ||
1949 | /* | 2084 | /* |
1950 | * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise | 2085 | * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise |
1951 | * idle runqueue: | 2086 | * idle runqueue: |
1952 | */ | 2087 | */ |
1953 | rq = task_rq_lock(task, &flags); | ||
1954 | if (rq->cfs.load.weight) | 2088 | if (rq->cfs.load.weight) |
1955 | rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | 2089 | rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); |
1956 | task_rq_unlock(rq, &flags); | ||
1957 | 2090 | ||
1958 | return rr_interval; | 2091 | return rr_interval; |
1959 | } | 2092 | } |
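get_rr_interval_fair() now receives the runqueue from its caller instead of taking the lock itself, and still reports sched_slice() converted to jiffies. As a rough reminder of what that slice is, assuming the usual period rule (sysctl_sched_latency, stretched to nr_running * min_granularity on a busy queue) rather than the kernel's exact code:

#include <stdio.h>

/* Back-of-the-envelope model of the slice reported above, not the
 * kernel's exact sched_slice(): the period is sysctl_sched_latency,
 * stretched once the queue holds more than sched_nr_latency tasks,
 * and each task gets a load-weight-proportional share of it. */
int main(void)
{
	unsigned long long latency = 5000000, min_gran = 1000000;
	unsigned long nr_running = 8, nr_latency = 5;
	unsigned long se_weight = 1024, total_weight = 8 * 1024;
	unsigned long long period, slice;

	period = latency;
	if (nr_running > nr_latency)
		period = nr_running * min_gran;

	slice = period * se_weight / total_weight;
	printf("period %llu ns, slice %llu ns\n", period, slice);
	return 0;
}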
@@ -1977,11 +2110,15 @@ static const struct sched_class fair_sched_class = { | |||
1977 | 2110 | ||
1978 | .load_balance = load_balance_fair, | 2111 | .load_balance = load_balance_fair, |
1979 | .move_one_task = move_one_task_fair, | 2112 | .move_one_task = move_one_task_fair, |
2113 | .rq_online = rq_online_fair, | ||
2114 | .rq_offline = rq_offline_fair, | ||
2115 | |||
2116 | .task_waking = task_waking_fair, | ||
1980 | #endif | 2117 | #endif |
1981 | 2118 | ||
1982 | .set_curr_task = set_curr_task_fair, | 2119 | .set_curr_task = set_curr_task_fair, |
1983 | .task_tick = task_tick_fair, | 2120 | .task_tick = task_tick_fair, |
1984 | .task_new = task_new_fair, | 2121 | .task_fork = task_fork_fair, |
1985 | 2122 | ||
1986 | .prio_changed = prio_changed_fair, | 2123 | .prio_changed = prio_changed_fair, |
1987 | .switched_to = switched_to_fair, | 2124 | .switched_to = switched_to_fair, |