Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 268
1 file changed, 182 insertions(+), 86 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d591..5bedf6e3ebf3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
  * run vmstat and monitor the context-switches (cs) field)
  */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+	= SCHED_TUNABLESCALING_LOG;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
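Note: with the LOG policy above, every latency-style tunable is effectively multiplied by 1 + ilog(ncpus). A minimal userspace sketch of that factor; ilog2_approx() is an illustrative stand-in for the kernel's ilog2(), and the 5 ms base is the sysctl_sched_latency default from this hunk:

#include <stdio.h>

/* Illustrative stand-in for the kernel's ilog2(): floor(log2(n)). */
static unsigned int ilog2_approx(unsigned int n)
{
	unsigned int log = 0;

	while (n >>= 1)
		log++;
	return log;
}

int main(void)
{
	unsigned int base_latency = 5000000;	/* sysctl_sched_latency default, ns */
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 16; ncpus *= 2) {
		unsigned int factor = 1 + ilog2_approx(ncpus);	/* SCHED_TUNABLESCALING_LOG */

		printf("%2u cpus: factor %u -> sched_latency %u ns\n",
		       ncpus, factor, base_latency * factor);
	}
	return 0;
}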
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
  * have immediate wakeup/sleep latencies.
  */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  */
 
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+int sched_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 					sysctl_sched_min_granularity);
 
+#define WRT_SYSCTL(name) \
+	(normalized_sysctl_##name = sysctl_##name / (factor))
+	WRT_SYSCTL(sched_min_granularity);
+	WRT_SYSCTL(sched_latency);
+	WRT_SYSCTL(sched_wakeup_granularity);
+	WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
+
 	return 0;
 }
 #endif
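Note: the WRT_SYSCTL() lines store a CPU-count-independent copy of each tunable whenever it is written via proc. A small sketch of that round trip, reduced to one tunable; the assumption here is that the rescale path multiplies the normalized value back by the current factor (the kernel-side update_sysctl()/get_update_sysctl_factor() are not shown in this diff):

#include <stdio.h>

static unsigned int factor = 3;				/* e.g. 1 + ilog2(4) on a 4-CPU box */
static unsigned int sysctl_sched_latency = 15000000;	/* value written via proc, ns */
static unsigned int normalized_sysctl_sched_latency;

/* Same token-pasting idea as WRT_SYSCTL above, reduced to a single tunable. */
#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))

int main(void)
{
	/* On a proc write: remember the CPU-count-independent value. */
	WRT_SYSCTL(sched_latency);
	printf("normalized: %u ns\n", normalized_sysctl_sched_latency);

	/* On a later rescale (say a CPU comes online): reapply the new factor. */
	factor = 4;
	sysctl_sched_latency = normalized_sysctl_sched_latency * factor;
	printf("rescaled:   %u ns\n", sysctl_sched_latency);
	return 0;
}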
@@ -822,6 +847,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
+		return;
+	}
+
+	/*
+	 * Ensure that a task that missed wakeup preemption by a
+	 * narrow margin doesn't have to wait for a full slice.
+	 * This also mitigates buddy induced latencies under load.
+	 */
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
+	if (delta_exec < sysctl_sched_min_granularity)
+		return;
+
+	if (cfs_rq->nr_running > 1) {
+		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		s64 delta = curr->vruntime - se->vruntime;
+
+		if (delta > ideal_runtime)
+			resched_task(rq_of(cfs_rq)->curr);
 	}
 }
 
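Note: the new tail of check_preempt_tick() also preempts when the leftmost waiter has fallen more than one ideal slice behind in vruntime, provided current has already run at least sysctl_sched_min_granularity. A standalone sketch of that decision with made-up numbers; sched_feat()/nr_running checks are omitted and sched_slice() plus the rbtree lookup are replaced by constants:

#include <stdio.h>

int main(void)
{
	/* Made-up numbers, all in nanoseconds. */
	long long ideal_runtime = 4000000;	/* stand-in for sched_slice()       */
	long long min_granularity = 1000000;	/* sysctl_sched_min_granularity     */
	long long delta_exec = 2500000;		/* how long current has run so far  */
	long long curr_vruntime = 10000000;
	long long left_vruntime = 5500000;	/* leftmost waiter in the rbtree    */

	if (delta_exec > ideal_runtime) {
		printf("slice used up: resched\n");
	} else if (delta_exec < min_granularity) {
		printf("below min granularity: keep running\n");
	} else if (curr_vruntime - left_vruntime > ideal_runtime) {
		/* Waiter lost a wakeup race by a lot: don't make it sit out the slice. */
		printf("vruntime gap exceeds one slice: resched\n");
	} else {
		printf("within one slice: keep running\n");
	}
	return 0;
}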
@@ -861,12 +906,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-		return cfs_rq->next;
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
 
-	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-		return cfs_rq->last;
+	/*
+	 * Prefer last buddy, try to return the CPU to a preempted task.
+	 */
+	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+		se = cfs_rq->last;
+
+	clear_buddies(cfs_rq, se);
 
 	return se;
 }
@@ -1319,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domain and find an eligible idle cpu.
+	 */
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
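Note: select_idle_sibling() prefers an idle prev_cpu and otherwise scans the domain for any idle, allowed CPU. A toy userspace model of the same scan; the arrays stand in for cpu_rq() and the domain span, and the kernel's target == cpu precondition is dropped for brevity:

#include <stdio.h>

#define NR_CPUS 8

/*
 * Toy model: nr_running[i] stands in for cpu_rq(i)->cfs.nr_running and
 * allowed[i] for "i is in both the domain span and p->cpus_allowed".
 */
static int pick_idle_sibling(const int nr_running[], const int allowed[],
			     int target, int prev_cpu)
{
	int i;

	/* An idle previous CPU wins outright: warm caches, no migration. */
	if (allowed[prev_cpu] && !nr_running[prev_cpu])
		return prev_cpu;

	for (i = 0; i < NR_CPUS; i++) {
		if (allowed[i] && !nr_running[i])
			return i;
	}
	return target;			/* nothing idle: keep the original target */
}

int main(void)
{
	int nr_running[NR_CPUS] = { 2, 1, 0, 3, 1, 0, 4, 1 };
	int allowed[NR_CPUS]    = { 1, 1, 1, 1, 0, 0, 0, 0 };

	printf("wake target: cpu %d\n", pick_idle_sibling(nr_running, allowed, 0, 1));
	return 0;
}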
@@ -1346,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		new_cpu = prev_cpu;
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If power savings logic is enabled for a domain, see if we
@@ -1372,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
+			int target = -1;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				target = cpu;
+
+			/*
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
+
+			if (target >= 0) {
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
+				cpu = target;
+			}
 		}
 
 		if (!want_sd && !want_affine)
@@ -1403,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			update_shares(tmp);
 	}
 
-	if (affine_sd && wake_affine(affine_sd, p, sync)) {
-		new_cpu = cpu;
-		goto out;
-	}
+	if (affine_sd && wake_affine(affine_sd, p, sync))
+		return cpu;
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
@@ -1447,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		/* while loop will break here if sd == NULL */
 	}
 
-out:
-	rcu_read_unlock();
 	return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1568,13 +1669,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int sync = wake_flags & WF_SYNC;
+	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-	update_curr(cfs_rq);
-
-	if (unlikely(rt_prio(p->prio))) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(rt_prio(p->prio)))
+		goto preempt;
 
 	if (unlikely(p->sched_class != &fair_sched_class))
 		return;
@@ -1582,18 +1680,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(se == pse))
 		return;
 
-	/*
-	 * Only set the backward buddy when the current task is still on the
-	 * rq. This can happen when a wakeup gets interleaved with schedule on
-	 * the ->pre_schedule() or idle_balance() point, either of which can
-	 * drop the rq lock.
-	 *
-	 * Also, during early boot the idle thread is in the fair class, for
-	 * obvious reasons its a bad idea to schedule back to the idle thread.
-	 */
-	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
-		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
@@ -1611,36 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE)) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(curr->policy == SCHED_IDLE))
+		goto preempt;
 
-	if ((sched_feat(WAKEUP_SYNC) && sync) ||
-	    (sched_feat(WAKEUP_OVERLAP) &&
-	     (se->avg_overlap < sysctl_sched_migration_cost &&
-	      pse->avg_overlap < sysctl_sched_migration_cost))) {
-		resched_task(curr);
-		return;
-	}
+	if (sched_feat(WAKEUP_SYNC) && sync)
+		goto preempt;
 
-	if (sched_feat(WAKEUP_RUNNING)) {
-		if (pse->avg_running < se->avg_running) {
-			set_next_buddy(pse);
-			resched_task(curr);
-			return;
-		}
-	}
+	if (sched_feat(WAKEUP_OVERLAP) &&
+	    se->avg_overlap < sysctl_sched_migration_cost &&
+	    pse->avg_overlap < sysctl_sched_migration_cost)
+		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
-
 	BUG_ON(!pse);
-
 	if (wakeup_preempt_entity(se, pse) == 1)
-		resched_task(curr);
+		goto preempt;
+
+	return;
+
+preempt:
+	resched_task(curr);
+	/*
+	 * Only set the backward buddy when the current task is still
+	 * on the rq. This can happen when a wakeup gets interleaved
+	 * with schedule on the ->pre_schedule() or idle_balance()
+	 * point, either of which can drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class,
+	 * for obvious reasons it's a bad idea to schedule back to it.
+	 */
+	if (unlikely(!se->on_rq || curr == rq->idle))
+		return;
+
+	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+		set_last_buddy(se);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
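Note: with the defaults earlier in this file, sched_nr_latency = DIV_ROUND_UP(5 ms, 1 ms) = 5, so the new scale test only arms the NEXT_BUDDY/LAST_BUDDY hints once at least five tasks are runnable on the cfs_rq. A quick check of that threshold, plain C with the values taken from this file:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Defaults from this file; the CPU scaling factor cancels out of the ratio. */
	unsigned int sysctl_sched_latency = 5000000;
	unsigned int sysctl_sched_min_granularity = 1000000;
	unsigned int sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
						     sysctl_sched_min_granularity);
	unsigned int nr_running;

	for (nr_running = 1; nr_running <= 8; nr_running++)
		printf("nr_running=%u: buddy hints %s\n", nr_running,
		       nr_running >= sched_nr_latency ? "armed" : "skipped");
	return 0;
}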
@@ -1649,21 +1744,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
 		se = pick_next_entity(cfs_rq);
-		/*
-		 * If se was a buddy, clear it so that it will have to earn
-		 * the favour again.
-		 *
-		 * If se was not a buddy, clear the buddies because neither
-		 * was elegible to run, let them earn it again.
-		 *
-		 * IOW. unconditionally clear buddies.
-		 */
-		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
@@ -1830,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+
+static void rq_online_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1847,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 
 /*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * called on fork with the child task as argument from the parent's context
+ *  - child not yet on the tasklist
+ *  - preemption disabled
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq = task_cfs_rq(current);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
+	struct rq *rq = this_rq();
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	sched_info_queued(p);
+	if (unlikely(task_cpu(p) != this_cpu))
+		__set_task_cpu(p, this_cpu);
 
 	update_curr(cfs_rq);
+
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	/* 'curr' will be NULL if the child belongs to a different group */
-	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-	    curr && entity_before(curr, se)) {
+	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1877,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 	}
 
-	enqueue_task_fair(rq, p, 0);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
@@ -1939,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
 
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
 	struct sched_entity *se = &task->se;
-	unsigned long flags;
-	struct rq *rq;
 	unsigned int rr_interval = 0;
 
 	/*
 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 	 * idle runqueue:
 	 */
-	rq = task_rq_lock(task, &flags);
 	if (rq->cfs.load.weight)
 		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-	task_rq_unlock(rq, &flags);
 
 	return rr_interval;
 }
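Note: get_rr_interval_fair() now reports the task's CFS slice converted from nanoseconds to jiffies, using the rq passed in by the caller instead of taking the rq lock itself. A rough worked conversion; the HZ value is an assumption and the load weighting inside sched_slice() is simplified to an even split:

#include <stdio.h>

#define HZ 1000
#define NSEC_PER_SEC 1000000000ULL
#define NS_TO_JIFFIES(ns) ((unsigned long)((ns) / (NSEC_PER_SEC / HZ)))

int main(void)
{
	/* Hypothetical numbers: 15 ms of scaled latency shared by 3 runnable tasks. */
	unsigned long long latency_ns = 15000000;
	unsigned int nr_running = 3;
	unsigned long long slice_ns = latency_ns / nr_running;	/* crude sched_slice() */

	printf("slice = %llu ns = %lu jiffies at HZ=%d\n",
	       slice_ns, NS_TO_JIFFIES(slice_ns), HZ);
	return 0;
}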
@@ -1977,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
 
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
+	.rq_online		= rq_online_fair,
+	.rq_offline		= rq_offline_fair,
 #endif
 
 	.set_curr_task		= set_curr_task_fair,
 	.task_tick		= task_tick_fair,
-	.task_new		= task_new_fair,
+	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,