Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  220
1 file changed, 148 insertions(+), 72 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..5bedf6e3ebf3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
  * run vmstat and monitor the context-switches (cs) field)
  */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+	= SCHED_TUNABLESCALING_LOG;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
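
The scaling described in the comment block above is applied through a factor derived from the number of online CPUs. The helper that computes it, get_update_sysctl_factor(), lives in kernel/sched.c and is not part of this diff; a minimal sketch of the idea, assuming the three SCHED_TUNABLESCALING_* modes behave exactly as listed, could look like this:

static unsigned int get_update_sysctl_factor(void)
{
	/* assumption: cap the CPU count so the factor stays small on big machines */
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;			/* unscaled, always *1 */
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;			/* *ncpus */
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);	/* *(1 + ilog(ncpus)) */
		break;
	}

	return factor;
}
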
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
  * have immediate wakeup/sleep latencies.
  */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  */
 
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 					sysctl_sched_min_granularity);
 
+#define WRT_SYSCTL(name) \
+	(normalized_sysctl_##name = sysctl_##name / (factor))
+	WRT_SYSCTL(sched_min_granularity);
+	WRT_SYSCTL(sched_latency);
+	WRT_SYSCTL(sched_wakeup_granularity);
+	WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
+
 	return 0;
 }
 #endif
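
To make the normalization done by WRT_SYSCTL concrete, take LOG scaling on an 8-CPU machine (an illustrative configuration, not one taken from the patch): the factor is 1 + ilog2(8) = 4, so writing 20000000 ns (20 ms) to the sched_latency sysctl leaves sysctl_sched_latency at 20000000 while the handler records

	normalized_sysctl_sched_latency = 20000000 / 4 = 5000000;

i.e. the CPU-count-independent value, from which the effective tunable can later be re-derived as normalized * factor when the number of online CPUs changes.
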
@@ -1345,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domain and find an elegible idle cpu.
+	 */
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
@@ -1372,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		new_cpu = prev_cpu;
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If power savings logic is enabled for a domain, see if we
@@ -1398,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
+			int target = -1;
+
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				target = cpu;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			/*
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
+
+			if (target >= 0) {
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
+				cpu = target;
+			}
 		}
 
 		if (!want_sd && !want_affine)
@@ -1429,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			update_shares(tmp);
 	}
 
-	if (affine_sd && wake_affine(affine_sd, p, sync)) {
-		new_cpu = cpu;
-		goto out;
-	}
+	if (affine_sd && wake_affine(affine_sd, p, sync))
+		return cpu;
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
@@ -1473,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		/* while loop will break here if sd == NULL */
 	}
 
-out:
-	rcu_read_unlock();
 	return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1596,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	int sync = wake_flags & WF_SYNC;
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-	update_curr(cfs_rq);
-
-	if (unlikely(rt_prio(p->prio))) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(rt_prio(p->prio)))
+		goto preempt;
 
 	if (unlikely(p->sched_class != &fair_sched_class))
 		return;
@@ -1627,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE)) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(curr->policy == SCHED_IDLE))
+		goto preempt;
 
-	if ((sched_feat(WAKEUP_SYNC) && sync) ||
-	    (sched_feat(WAKEUP_OVERLAP) &&
-	     (se->avg_overlap < sysctl_sched_migration_cost &&
-	      pse->avg_overlap < sysctl_sched_migration_cost))) {
-		resched_task(curr);
-		return;
-	}
+	if (sched_feat(WAKEUP_SYNC) && sync)
+		goto preempt;
 
-	if (sched_feat(WAKEUP_RUNNING)) {
-		if (pse->avg_running < se->avg_running) {
-			set_next_buddy(pse);
-			resched_task(curr);
-			return;
-		}
-	}
+	if (sched_feat(WAKEUP_OVERLAP) &&
+	    se->avg_overlap < sysctl_sched_migration_cost &&
+	    pse->avg_overlap < sysctl_sched_migration_cost)
+		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
-
 	BUG_ON(!pse);
+	if (wakeup_preempt_entity(se, pse) == 1)
+		goto preempt;
 
-	if (wakeup_preempt_entity(se, pse) == 1) {
-		resched_task(curr);
-		/*
-		 * Only set the backward buddy when the current task is still
-		 * on the rq. This can happen when a wakeup gets interleaved
-		 * with schedule on the ->pre_schedule() or idle_balance()
-		 * point, either of which can * drop the rq lock.
-		 *
-		 * Also, during early boot the idle thread is in the fair class,
-		 * for obvious reasons its a bad idea to schedule back to it.
-		 */
-		if (unlikely(!se->on_rq || curr == rq->idle))
-			return;
-		if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
-			set_last_buddy(se);
-	}
+	return;
+
+preempt:
+	resched_task(curr);
+	/*
+	 * Only set the backward buddy when the current task is still
+	 * on the rq. This can happen when a wakeup gets interleaved
+	 * with schedule on the ->pre_schedule() or idle_balance()
+	 * point, either of which can * drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class,
+	 * for obvious reasons its a bad idea to schedule back to it.
+	 */
+	if (unlikely(!se->on_rq || curr == rq->idle))
+		return;
+
+	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+		set_last_buddy(se);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1744,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
@@ -1850,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+
+static void rq_online_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
 #endif /* CONFIG_SMP */
 
 /*
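
rq_online_fair()/rq_offline_fair() hook CPU hotplug so the effective tunables track the online CPU count. update_sysctl() itself is defined in kernel/sched.c and is not shown in this diff; assuming it simply inverts the WRT_SYSCTL normalization above, a sketch would be:

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

	/* re-derive the effective values from the CPU-count-independent ones */
#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
	SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}

With this, a box that boots with two CPUs and later onlines six more ends up with the same effective tunables as one that booted with all eight.
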
@@ -1867,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 
 /*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * called on fork with the child task as argument from the parent's context
+ *  - child not yet on the tasklist
+ *  - preemption disabled
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq = task_cfs_rq(current);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
+	struct rq *rq = this_rq();
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	sched_info_queued(p);
+	if (unlikely(task_cpu(p) != this_cpu))
+		__set_task_cpu(p, this_cpu);
 
 	update_curr(cfs_rq);
+
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	/* 'curr' will be NULL if the child belongs to a different group */
-	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr && entity_before(curr, se)) {
+	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1897,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 	}
 
-	enqueue_task_fair(rq, p, 0);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
@@ -1959,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
 
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
 	struct sched_entity *se = &task->se;
-	unsigned long flags;
-	struct rq *rq;
 	unsigned int rr_interval = 0;
 
 	/*
 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 	 * idle runqueue:
 	 */
-	rq = task_rq_lock(task, &flags);
 	if (rq->cfs.load.weight)
 		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-	task_rq_unlock(rq, &flags);
 
 	return rr_interval;
 }
@@ -1997,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
 
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
+	.rq_online		= rq_online_fair,
+	.rq_offline		= rq_offline_fair,
 #endif
 
 	.set_curr_task		= set_curr_task_fair,
 	.task_tick		= task_tick_fair,
-	.task_new		= task_new_fair,
+	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,