Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile      |    5
-rw-r--r--  kernel/sched/clock.c       |  107
-rw-r--r--  kernel/sched/core.c        |  880
-rw-r--r--  kernel/sched/cpuacct.c     |   18
-rw-r--r--  kernel/sched/cpudeadline.c |  216
-rw-r--r--  kernel/sched/cpudeadline.h |   33
-rw-r--r--  kernel/sched/deadline.c    | 1639
-rw-r--r--  kernel/sched/debug.c       |    4
-rw-r--r--  kernel/sched/fair.c        |  109
-rw-r--r--  kernel/sched/rt.c          |   10
-rw-r--r--  kernel/sched/sched.h       |  145
-rw-r--r--  kernel/sched/stop_task.c   |    2
12 files changed, 2931 insertions, 237 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o proc.o clock.o cputime.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o
-obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
  * at 0 on boot (but people really shouldn't rely on that).
  *
  * cpu_clock(i) -- can be used from any context, including NMI.
- * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
  * local_clock() -- is cpu_clock() on the current cpu.
  *
+ * sched_clock_cpu(i)
+ *
  * How:
  *
  * The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
  * Furthermore, explicit sleep and wakeup hooks allow us to account for time
  * that is otherwise invisible (TSC gets stopped).
  *
- *
- * Notes:
- *
- * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
- * like cpufreq interrupts that can change the base clock (TSC) multiplier
- * and cause funny jumps in time -- although the filtering provided by
- * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
- * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
- * sched_clock().
  */
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
@@ -66,6 +58,8 @@
 #include <linux/percpu.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
+#include <linux/static_key.h>
+#include <linux/workqueue.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -82,7 +76,52 @@ EXPORT_SYMBOL_GPL(sched_clock);
 __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__read_mostly int sched_clock_stable;
+static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
+static int __sched_clock_stable_early;
+
+int sched_clock_stable(void)
+{
+	return static_key_false(&__sched_clock_stable);
+}
+
+static void __set_sched_clock_stable(void)
+{
+	if (!sched_clock_stable())
+		static_key_slow_inc(&__sched_clock_stable);
+}
+
+void set_sched_clock_stable(void)
+{
+	__sched_clock_stable_early = 1;
+
+	smp_mb(); /* matches sched_clock_init() */
+
+	if (!sched_clock_running)
+		return;
+
+	__set_sched_clock_stable();
+}
+
+static void __clear_sched_clock_stable(struct work_struct *work)
+{
+	/* XXX worry about clock continuity */
+	if (sched_clock_stable())
+		static_key_slow_dec(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+
+void clear_sched_clock_stable(void)
+{
+	__sched_clock_stable_early = 0;
+
+	smp_mb(); /* matches sched_clock_init() */
+
+	if (!sched_clock_running)
+		return;
+
+	schedule_work(&sched_clock_work);
+}
 
 struct sched_clock_data {
 	u64 tick_raw;
@@ -116,6 +155,20 @@ void sched_clock_init(void)
 	}
 
 	sched_clock_running = 1;
+
+	/*
+	 * Ensure that it is impossible to not do a static_key update.
+	 *
+	 * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+	 * and do the update, or we must see their __sched_clock_stable_early
+	 * and do the update, or both.
+	 */
+	smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+	if (__sched_clock_stable_early)
+		__set_sched_clock_stable();
+	else
+		__clear_sched_clock_stable(NULL);
 }
 
 /*
@@ -242,20 +295,20 @@ u64 sched_clock_cpu(int cpu)
 	struct sched_clock_data *scd;
 	u64 clock;
 
-	WARN_ON_ONCE(!irqs_disabled());
-
-	if (sched_clock_stable)
+	if (sched_clock_stable())
 		return sched_clock();
 
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
+	preempt_disable_notrace();
 	scd = cpu_sdc(cpu);
 
 	if (cpu != smp_processor_id())
 		clock = sched_clock_remote(scd);
 	else
 		clock = sched_clock_local(scd);
+	preempt_enable_notrace();
 
 	return clock;
 }
@@ -265,7 +318,7 @@ void sched_clock_tick(void)
 	struct sched_clock_data *scd;
 	u64 now, now_gtod;
 
-	if (sched_clock_stable)
+	if (sched_clock_stable())
 		return;
 
 	if (unlikely(!sched_clock_running))
@@ -316,14 +369,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
  */
 u64 cpu_clock(int cpu)
 {
-	u64 clock;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	clock = sched_clock_cpu(cpu);
-	local_irq_restore(flags);
+	if (!sched_clock_stable())
+		return sched_clock_cpu(cpu);
 
-	return clock;
+	return sched_clock();
 }
 
 /*
@@ -335,14 +384,10 @@ u64 cpu_clock(int cpu)
  */
 u64 local_clock(void)
 {
-	u64 clock;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	clock = sched_clock_cpu(smp_processor_id());
-	local_irq_restore(flags);
+	if (!sched_clock_stable())
+		return sched_clock_cpu(raw_smp_processor_id());
 
-	return clock;
+	return sched_clock();
 }
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -362,12 +407,12 @@ u64 sched_clock_cpu(int cpu)
 
 u64 cpu_clock(int cpu)
 {
-	return sched_clock_cpu(cpu);
+	return sched_clock();
}
 
 u64 local_clock(void)
 {
-	return sched_clock_cpu(0);
+	return sched_clock();
 }
 
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
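
For illustration (not part of the patch): the clock.c hunks above turn sched_clock_stable into a static-key-backed function and drop the requirement that cpu_clock()/local_clock() be called with IRQs disabled -- the unstable path now only needs preemption disabled, which sched_clock_cpu() handles itself. A caller after this change can therefore sample the clock directly from any context; the function name and the timed work below are made up for the example:

	u64 t0, delta_ns;

	t0 = local_clock();		/* nanoseconds, callable from any context */
	do_some_work();			/* hypothetical section being timed */
	delta_ns = local_clock() - t0;
	pr_info("work took %llu ns\n", delta_ns);
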
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..f5c6635b806c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
 {
 	int prio;
 
-	if (task_has_rt_policy(p))
+	if (task_has_dl_policy(p))
+		prio = MAX_DL_PRIO-1;
+	else if (task_has_rt_policy(p))
 		prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		prio = __normal_prio(p);
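
For reference, the resulting prio layout (taking MAX_DL_PRIO as 0, which is defined elsewhere in this series and not visible in this hunk): a SCHED_DEADLINE task gets normal_prio = MAX_DL_PRIO-1 = -1, SCHED_FIFO/SCHED_RR tasks map to MAX_RT_PRIO-1 - rt_priority = 0..98, and SCHED_NORMAL tasks keep their static_prio of 100..139. dl_prio()/dl_task() therefore reduce to a "prio < 0" test, so -deadline tasks always win plain priority comparisons against -rt and fair tasks.
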
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
 		p->sched_class->switched_to(rq, p);
-	} else if (oldprio != p->prio)
+	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
 	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
 		goto out;
 
+	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
 	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
 
 out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
 	 * this IPI.
 	 */
-	if (tif_need_resched())
-		set_preempt_need_resched();
+	preempt_fold_need_resched();
 
 	if (llist_empty(&this_rq()->wake_list)
 	    && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
+	RB_CLEAR_NODE(&p->dl.rb_node);
+	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	p->dl.dl_runtime = p->dl.runtime = 0;
+	p->dl.dl_deadline = p->dl.deadline = 0;
+	p->dl.dl_period = 0;
+	p->dl.flags = 0;
+
 	INIT_LIST_HEAD(&p->rt.run_list);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled)
 	numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = numabalancing_enabled;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_numabalancing_state(state);
+	return err;
+}
+#endif
+#endif
 
 /*
  * fork()/clone()-time setup:
 */
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long flags;
 	int cpu = get_cpu();
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (task_has_rt_policy(p)) {
+		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_reset_on_fork = 0;
 	}
 
-	if (!rt_prio(p->prio))
+	if (dl_prio(p->prio)) {
+		put_cpu();
+		return -EAGAIN;
+	} else if (rt_prio(p->prio)) {
+		p->sched_class = &rt_sched_class;
+	} else {
 		p->sched_class = &fair_sched_class;
+	}
 
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 	init_task_preempt_count(p);
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
 
 	put_cpu();
+	return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+	if (runtime == RUNTIME_INF)
+		return 1ULL << 20;
+
+	/*
+	 * Doing this here saves a lot of checks in all
+	 * the calling paths, and returning zero seems
+	 * safe for them anyway.
+	 */
+	if (period == 0)
+		return 0;
+
+	return div64_u64(runtime << 20, period);
 }
 
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+	struct root_domain *rd = cpu_rq(i)->rd;
+	int cpus = 0;
+
+	for_each_cpu_and(i, rd->span, cpu_active_mask)
+		cpus++;
+
+	return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+	return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+	return dl_b->bw != -1 &&
+	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+		       const struct sched_attr *attr)
+{
+
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	u64 period = attr->sched_period ?: attr->sched_deadline;
+	u64 runtime = attr->sched_runtime;
+	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+	int cpus, err = -1;
+
+	if (new_bw == p->dl.dl_bw)
+		return 0;
+
+	/*
+	 * Either if a task, enters, leave, or stays -deadline but changes
+	 * its parameters, we may need to update accordingly the total
+	 * allocated bandwidth of the container.
+	 */
+	raw_spin_lock(&dl_b->lock);
+	cpus = dl_bw_cpus(task_cpu(p));
+	if (dl_policy(policy) && !task_has_dl_policy(p) &&
+	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
+		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		err = 0;
	}
+	raw_spin_unlock(&dl_b->lock);
+
+	return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
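
A worked example of the admission arithmetic above (illustrative numbers only): to_ratio() expresses utilization in 1/2^20 units, so a task with sched_runtime = 10 ms and sched_period = 100 ms contributes new_bw = (10000000 << 20) / 100000000 = 104857, i.e. roughly 0.1 of a CPU. Assuming the usual sched_rt_period_us/sched_rt_runtime_us defaults of 1000000/950000, each root domain's limit is dl_b->bw = to_ratio(1000000, 950000) = 996147 (about 0.95), and on a 4-CPU root domain __dl_overflow() keeps admitting tasks as long as total_bw - old_bw + new_bw stays below 4 * 996147 -- roughly 3.8 CPUs' worth of -deadline bandwidth.
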
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	if (unlikely(prev_state == TASK_DEAD)) {
 		task_numa_free(prev);
 
+		if (prev->sched_class->task_dead)
+			prev->sched_class->task_dead(prev);
+
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -2296,7 +2447,7 @@ void scheduler_tick(void)
 
 #ifdef CONFIG_SMP
 	rq->idle_balance = idle_cpu(cpu);
-	trigger_load_balance(rq, cpu);
+	trigger_load_balance(rq);
 #endif
 	rq_last_tick_reset(rq);
 }
@@ -2325,7 +2476,7 @@ u64 scheduler_tick_max_deferment(void)
 	if (time_before_eq(next, now))
 		return 0;
 
-	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
+	return jiffies_to_nsecs(next - now);
 }
 #endif
 
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev)
 {
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path for now.
-	 * Otherwise, whine if we are scheduling when we should not be.
+	 * schedule() atomically, we ignore that path. Otherwise whine
+	 * if we are scheduling when we should not.
 	 */
-	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 		__schedule_bug(prev);
 	rcu_sleep_check();
 
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, on_rq, running;
+	int oldprio, on_rq, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
-	BUG_ON(prio < 0 || prio > MAX_PRIO);
+	BUG_ON(prio > MAX_PRIO);
 
 	rq = __task_rq_lock(p);
 
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	}
 
 	trace_sched_pi_setprio(p, prio);
+	p->pi_top_task = rt_mutex_get_top_task(p);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->on_rq;
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
-	if (rt_prio(prio))
+	/*
+	 * Boosting condition are:
+	 * 1. -rt task is running and holds mutex A
+	 *      --> -dl task blocks on mutex A
+	 *
+	 * 2. -dl task is running and holds mutex A
+	 *      --> -dl task blocks on mutex A and could preempt the
+	 *          running task
+	 */
+	if (dl_prio(prio)) {
+		if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+			p->dl.dl_boosted = 1;
+			p->dl.dl_throttled = 0;
+			enqueue_flag = ENQUEUE_REPLENISH;
+		} else
+			p->dl.dl_boosted = 0;
+		p->sched_class = &dl_sched_class;
+	} else if (rt_prio(prio)) {
+		if (dl_prio(oldprio))
+			p->dl.dl_boosted = 0;
+		if (oldprio < prio)
+			enqueue_flag = ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
-	else
+	} else {
+		if (dl_prio(oldprio))
+			p->dl.dl_boosted = 0;
 		p->sched_class = &fair_sched_class;
+	}
 
 	p->prio = prio;
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq)
-		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+		enqueue_task(rq, p, enqueue_flag);
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
+
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
-	 * SCHED_FIFO/SCHED_RR:
+	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
 	 */
-	if (task_has_rt_policy(p)) {
+	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
 static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	init_dl_task_timer(dl_se);
+	dl_se->dl_runtime = attr->sched_runtime;
+	dl_se->dl_deadline = attr->sched_deadline;
+	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+	dl_se->flags = attr->sched_flags;
+	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+	dl_se->dl_throttled = 0;
+	dl_se->dl_new = 1;
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
+{
+	int policy = attr->sched_policy;
+
+	if (policy == -1) /* setparam */
+		policy = p->policy;
+
 	p->policy = policy;
-	p->rt_priority = prio;
+
+	if (dl_policy(policy))
+		__setparam_dl(p, attr);
+	else if (fair_policy(policy))
+		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+	/*
+	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
+	 * !rt_policy. Always setting this ensures that things like
+	 * getparam()/getattr() don't report silly values for !rt tasks.
+	 */
+	p->rt_priority = attr->sched_priority;
+
 	p->normal_prio = normal_prio(p);
-	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	if (rt_prio(p->prio))
+
+	if (dl_prio(p->prio))
+		p->sched_class = &dl_sched_class;
+	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
+
 	set_load_weight(p);
 }
 
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	attr->sched_priority = p->rt_priority;
+	attr->sched_runtime = dl_se->dl_runtime;
+	attr->sched_deadline = dl_se->dl_deadline;
+	attr->sched_period = dl_se->dl_period;
+	attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+	return attr && attr->sched_deadline != 0 &&
+		(attr->sched_period == 0 ||
+		(s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
+		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
+		attr->sched_runtime >= (2 << (DL_SCALE - 1));
+}
+
 /*
  * check the target process has a UID that matches the current process's
  */
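
To make the constraint in __checkparam_dl() concrete (example values only): all three fields are in nanoseconds and must satisfy runtime <= deadline <= period, where sched_period == 0 means "use sched_deadline as the period" and sched_runtime must be at least 2^DL_SCALE ns -- about 1 us, taking DL_SCALE = 10 as defined elsewhere in this series. So sched_runtime = 10000000, sched_deadline = 30000000, sched_period = 100000000 (10 ms of budget every 100 ms, with a 30 ms relative deadline) passes, while swapping deadline and period, or a zero deadline, makes __sched_setscheduler() return -EINVAL.
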
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+				const struct sched_attr *attr,
+				bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
@@ -3037,31 +3290,40 @@ recheck:
 		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
 	} else {
-		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
-		policy &= ~SCHED_RESET_ON_FORK;
+		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+		if (policy != SCHED_DEADLINE &&
+		    policy != SCHED_FIFO && policy != SCHED_RR &&
 		    policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 		    policy != SCHED_IDLE)
 			return -EINVAL;
 	}
 
+	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+		return -EINVAL;
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
-	if (param->sched_priority < 0 ||
-	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (rt_policy(policy) != (param->sched_priority != 0))
+	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+	    (rt_policy(policy) != (attr->sched_priority != 0)))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
+		if (fair_policy(policy)) {
+			if (attr->sched_nice < TASK_NICE(p) &&
+			    !can_nice(p, attr->sched_nice))
+				return -EPERM;
+		}
+
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,11 +3333,20 @@ recheck:
 				return -EPERM;
 
 			/* can't increase priority */
-			if (param->sched_priority > p->rt_priority &&
-			    param->sched_priority > rlim_rtprio)
+			if (attr->sched_priority > p->rt_priority &&
+			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 
+		/*
+		 * Can't set/change SCHED_DEADLINE policy at all for now
+		 * (safest behavior); in the future we would like to allow
+		 * unprivileged DL tasks to increase their relative deadline
+		 * or reduce their runtime (both ways reducing utilization)
+		 */
+		if (dl_policy(policy))
+			return -EPERM;
+
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
@@ -3120,14 +3391,21 @@ recheck:
 	/*
 	 * If not changing anything there's no need to proceed further:
 	 */
-	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-		     param->sched_priority == p->rt_priority))) {
+	if (unlikely(policy == p->policy)) {
+		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+			goto change;
+		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+			goto change;
+		if (dl_policy(policy))
+			goto change;
+
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
+change:
 
-#ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
 		/*
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
@@ -3138,8 +3416,24 @@ recheck:
 			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
-	}
 #endif
+#ifdef CONFIG_SMP
+		if (dl_bandwidth_enabled() && dl_policy(policy)) {
+			cpumask_t *span = rq->rd->span;
+
+			/*
+			 * Don't allow tasks with an affinity mask smaller than
+			 * the entire root_domain to become SCHED_DEADLINE. We
+			 * will also fail if there's no bandwidth available.
+			 */
+			if (!cpumask_subset(span, &p->cpus_allowed) ||
+			    rq->rd->dl_bw.bw == 0) {
+				task_rq_unlock(rq, p, &flags);
+				return -EPERM;
+			}
+		}
+#endif
+	}
 
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3441,17 @@ recheck:
 		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
+
+	/*
+	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
+	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+	 * is available.
+	 */
+	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+		task_rq_unlock(rq, p, &flags);
+		return -EBUSY;
+	}
+
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -3158,7 +3463,7 @@ recheck:
 
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, policy, param->sched_priority);
+	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3173,6 +3478,26 @@ recheck:
 	return 0;
 }
 
+static int _sched_setscheduler(struct task_struct *p, int policy,
+			       const struct sched_param *param, bool check)
+{
+	struct sched_attr attr = {
+		.sched_policy   = policy,
+		.sched_priority = param->sched_priority,
+		.sched_nice	= PRIO_TO_NICE(p->static_prio),
+	};
+
+	/*
+	 * Fixup the legacy SCHED_RESET_ON_FORK hack
+	 */
+	if (policy & SCHED_RESET_ON_FORK) {
+		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+		policy &= ~SCHED_RESET_ON_FORK;
+		attr.sched_policy = policy;
+	}
+
+	return __sched_setscheduler(p, &attr, check);
+}
 /**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
@@ -3186,10 +3511,16 @@ recheck:
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, true);
+	return _sched_setscheduler(p, policy, param, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
 /**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
@@ -3206,7 +3537,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, false);
+	return _sched_setscheduler(p, policy, param, false);
 }
 
 static int
@@ -3231,6 +3562,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 	return retval;
 }
 
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 /**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
@@ -3262,6 +3666,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 }
 
 /**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, flags)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || flags)
+		return -EINVAL;
+
+	if (sched_copy_attr(uattr, &attr))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
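
Since there is no glibc wrapper for the new call, a minimal user-space sketch of driving it looks roughly as follows. This is illustrative only and not part of the patch: the local struct mirrors the uapi sched_attr layout introduced by this series, SCHED_DEADLINE is 6, __NR_sched_setattr is assumed to come from the installed kernel headers, and the call needs CAP_SYS_NICE.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
	uint32_t size;			/* sizeof(struct sched_attr), for ABI extension */
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#define SCHED_DEADLINE	6

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	=  10 * 1000 * 1000,	/* 10 ms of budget ...  */
		.sched_deadline	=  30 * 1000 * 1000,	/* ... within 30 ms ... */
		.sched_period	= 100 * 1000 * 1000,	/* ... every 100 ms     */
	};

	/* pid 0 means the calling task; the last argument is the (unused) flags word */
	if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	/* ... periodic -deadline work would run here ... */
	return 0;
}

sched_getattr() is the symmetric read side: it takes an explicit size argument and uses the size-based checks in sched_copy_attr()/sched_read_attr() above so that older and newer user-space binaries keep working as the structure grows.
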
@@ -3316,6 +3748,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (retval)
 		goto out_unlock;
 
+	if (task_has_dl_policy(p)) {
+		retval = -EINVAL;
+		goto out_unlock;
+	}
 	lp.sched_priority = p->rt_priority;
 	rcu_read_unlock();
 
@@ -3331,6 +3767,96 @@ out_unlock:
 	return retval;
 }
 
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get uncomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				goto err_size;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, attr->size);
+	if (ret)
+		return -EFAULT;
+
+out:
+	return ret;
+
+err_size:
+	ret = -E2BIG;
+	goto out;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size, unsigned int, flags)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0 || flags)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (p->sched_reset_on_fork)
+		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+	if (task_has_dl_policy(p))
+		__getparam_dl(p, &attr);
+	else if (task_has_rt_policy(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = TASK_NICE(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -3375,8 +3901,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (retval)
 		goto out_unlock;
 
+
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
+
+	/*
+	 * Since bandwidth control happens on root_domain basis,
+	 * if admission test is enabled, we only admit -deadline
+	 * tasks allowed to run on all the CPUs in the task's
+	 * root_domain.
+	 */
+#ifdef CONFIG_SMP
+	if (task_has_dl_policy(p)) {
+		const struct cpumask *span = task_rq(p)->rd->span;
+
+		if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+			retval = -EBUSY;
+			goto out_unlock;
+		}
+	}
+#endif
 again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 
@@ -3653,7 +4197,7 @@ again:
 	}
 
 	double_rq_lock(rq, p_rq);
-	while (task_rq(p) != p_rq) {
+	if (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
 		goto again;
 	}
@@ -3742,6 +4286,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 	case SCHED_RR:
 		ret = MAX_USER_RT_PRIO-1;
 		break;
+	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
@@ -3768,6 +4313,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 	case SCHED_RR:
 		ret = 1;
 		break;
+	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
@@ -3811,7 +4357,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		goto out_unlock;
 
 	rq = task_rq_lock(p, &flags);
-	time_slice = p->sched_class->get_rr_interval(rq, p);
+	time_slice = 0;
+	if (p->sched_class->get_rr_interval)
+		time_slice = p->sched_class->get_rr_interval(rq, p);
 	task_rq_unlock(rq, p, &flags);
 
 	rcu_read_unlock();
@@ -4090,6 +4638,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 
 	/* TODO: This is not properly updating schedstats */
 
+	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
 
@@ -4514,13 +5063,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
 static int sched_cpu_inactive(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
+	unsigned long flags;
+	long cpu = (long)hcpu;
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-		set_cpu_active((long)hcpu, false);
+		set_cpu_active(cpu, false);
+
+		/* explicitly allow suspend */
+		if (!(action & CPU_TASKS_FROZEN)) {
+			struct dl_bw *dl_b = dl_bw_of(cpu);
+			bool overflow;
+			int cpus;
+
+			raw_spin_lock_irqsave(&dl_b->lock, flags);
+			cpus = dl_bw_cpus(cpu);
+			overflow = __dl_overflow(dl_b, cpus, 0, 0);
+			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+			if (overflow)
+				return notifier_from_errno(-EBUSY);
+		}
 		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
 	}
+
+	return NOTIFY_DONE;
 }
 
 static int __init migration_init(void)
@@ -4739,6 +5306,8 @@ static void free_rootdomain(struct rcu_head *rcu)
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
 	cpupri_cleanup(&rd->cpupri);
+	cpudl_cleanup(&rd->cpudl);
+	free_cpumask_var(rd->dlo_mask);
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
@@ -4790,8 +5359,14 @@ static int init_rootdomain(struct root_domain *rd)
 		goto out;
 	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
 		goto free_online;
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_dlo_mask;
+
+	init_dl_bw(&rd->dl_bw);
+	if (cpudl_init(&rd->cpudl) != 0)
+		goto free_dlo_mask;
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
@@ -4799,6 +5374,8 @@ static int init_rootdomain(struct root_domain *rd)
 
 free_rto_mask:
 	free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+	free_cpumask_var(rd->dlo_mask);
 free_online:
 	free_cpumask_var(rd->online);
 free_span:
@@ -6150,6 +6727,7 @@ void __init sched_init_smp(void)
 	free_cpumask_var(non_isolated_cpus);
 
 	init_sched_rt_class();
+	init_sched_dl_class();
 }
 #else
 void __init sched_init_smp(void)
@@ -6219,13 +6797,15 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
+	init_rt_bandwidth(&def_rt_bandwidth,
+			global_rt_period(), global_rt_runtime());
+	init_dl_bandwidth(&def_dl_bandwidth,
+			global_rt_period(), global_rt_runtime());
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
 
-	init_rt_bandwidth(&def_rt_bandwidth,
-			global_rt_period(), global_rt_runtime());
-
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
@@ -6249,6 +6829,7 @@ void __init sched_init(void)
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
+		init_dl_rq(&rq->dl, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6901,6 @@ void __init sched_init(void)
 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
 
-#ifdef CONFIG_RT_MUTEXES
-	plist_head_init(&init_task.pi_waiters);
-#endif
-
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
@@ -6397,13 +6974,16 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	const struct sched_class *prev_class = p->sched_class;
+	struct sched_attr attr = {
+		.sched_policy = SCHED_NORMAL,
+	};
 	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, SCHED_NORMAL, 0);
+	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		resched_task(rq->curr);
@@ -6433,7 +7013,7 @@ void normalize_rt_tasks(void)
 		p->se.statistics.block_start = 0;
 #endif
 
-		if (!rt_task(p)) {
+		if (!dl_task(p) && !rt_task(p)) {
 			/*
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
@@ -6628,16 +7208,6 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-	if (runtime == RUNTIME_INF)
-		return 1ULL << 20;
-
-	return div64_u64(runtime << 20, period);
-}
-#endif
-
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
 * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7381,13 @@ static long sched_group_rt_period(struct task_group *tg)
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-	u64 runtime, period;
 	int ret = 0;
 
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
-	runtime = global_rt_runtime();
-	period = global_rt_period();
-
-	/*
-	 * Sanity check on the sysctl variables.
-	 */
-	if (runtime > period && runtime != RUNTIME_INF)
-		return -EINVAL;
-
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7410,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
-	int i;
-
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
-	/*
-	 * There's always some RT tasks in the root group
-	 * -- migration, kstopmachine etc..
-	 */
-	if (sysctl_sched_rt_runtime == 0)
-		return -EBUSY;
+	int i, ret = 0;
 
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
@@ -6873,36 +7422,91 @@ static int sched_rt_global_constraints(void)
 	}
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-	return 0;
+	return ret;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-int sched_rr_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
+static int sched_dl_global_constraints(void)
 {
-	int ret;
-	static DEFINE_MUTEX(mutex);
+	u64 runtime = global_rt_runtime();
+	u64 period = global_rt_period();
+	u64 new_bw = to_ratio(period, runtime);
+	int cpu, ret = 0;
+	unsigned long flags;
 
-	mutex_lock(&mutex);
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-	/* make sure that internally we keep jiffies */
-	/* also, writing zero resets timeslice to default */
-	if (!ret && write) {
-		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+	/*
+	 * Here we want to check the bandwidth not being set to some
	 * value smaller than the currently allocated bandwidth in
+	 * any of the root_domains.
+	 *
+	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+	 * cycling on root_domains... Discussion on different/better
+	 * solutions is welcome!
+	 */
+	for_each_possible_cpu(cpu) {
+		struct dl_bw *dl_b = dl_bw_of(cpu);
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		if (new_bw < dl_b->total_bw)
+			ret = -EBUSY;
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+		if (ret)
+			break;
 	}
-	mutex_unlock(&mutex);
+
 	return ret;
 }
 
+static void sched_dl_do_global(void)
+{
+	u64 new_bw = -1;
+	int cpu;
+	unsigned long flags;
+
+	def_dl_bandwidth.dl_period = global_rt_period();
+	def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+	if (global_rt_runtime() != RUNTIME_INF)
+		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+	/*
+	 * FIXME: As above...
+	 */
+	for_each_possible_cpu(cpu) {
+		struct dl_bw *dl_b = dl_bw_of(cpu);
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		dl_b->bw = new_bw;
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	}
+}
+
+static int sched_rt_global_validate(void)
+{
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
+	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+	def_rt_bandwidth.rt_runtime = global_rt_runtime();
+	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret;
 	int old_period, old_runtime;
 	static DEFINE_MUTEX(mutex);
+	int ret;
 
 	mutex_lock(&mutex);
 	old_period = sysctl_sched_rt_period;
@@ -6911,21 +7515,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7515 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912 7516
6913 if (!ret && write) { 7517 if (!ret && write) {
7518 ret = sched_rt_global_validate();
7519 if (ret)
7520 goto undo;
7521
6914 ret = sched_rt_global_constraints(); 7522 ret = sched_rt_global_constraints();
6915 if (ret) { 7523 if (ret)
6916 sysctl_sched_rt_period = old_period; 7524 goto undo;
6917 sysctl_sched_rt_runtime = old_runtime; 7525
6918 } else { 7526 ret = sched_dl_global_constraints();
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7527 if (ret)
6920 def_rt_bandwidth.rt_period = 7528 goto undo;
6921 ns_to_ktime(global_rt_period()); 7529
6922 } 7530 sched_rt_do_global();
7531 sched_dl_do_global();
7532 }
7533 if (0) {
7534undo:
7535 sysctl_sched_rt_period = old_period;
7536 sysctl_sched_rt_runtime = old_runtime;
6923 } 7537 }
6924 mutex_unlock(&mutex); 7538 mutex_unlock(&mutex);
6925 7539
6926 return ret; 7540 return ret;
6927} 7541}
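/*
 * A standalone sketch, not from this patch, of the "if (0) { undo: }"
 * idiom used in sched_rt_handler() above: the undo block is unreachable
 * except via goto, so the rollback of the sysctl values only runs when
 * validation fails. All names and values below are made up for
 * illustration.
 */
#include <stdio.h>

static int apply_settings(int *period, int *runtime, int new_period, int new_runtime)
{
	int old_period = *period, old_runtime = *runtime;
	int ret = 0;

	/* tentatively apply, mirroring proc_dointvec() writing the sysctls */
	*period = new_period;
	*runtime = new_runtime;

	if (new_period <= 0 || new_runtime > new_period) {
		ret = -1;
		goto undo;		/* validation failed */
	}
	if (0) {
undo:
		/* only reachable via the goto: restore the old values */
		*period = old_period;
		*runtime = old_runtime;
	}
	return ret;
}

int main(void)
{
	int period = 1000, runtime = 950;

	/* prints "-1 1000 950": the invalid update was rolled back */
	printf("%d %d %d\n", apply_settings(&period, &runtime, 100, 200), period, runtime);
	return 0;
}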
6928 7542
7543int sched_rr_handler(struct ctl_table *table, int write,
7544 void __user *buffer, size_t *lenp,
7545 loff_t *ppos)
7546{
7547 int ret;
7548 static DEFINE_MUTEX(mutex);
7549
7550 mutex_lock(&mutex);
7551 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7552 /* make sure that internally we keep jiffies */
7553 /* also, writing zero resets timeslice to default */
7554 if (!ret && write) {
7555 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7556 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7557 }
7558 mutex_unlock(&mutex);
7559 return ret;
7560}
7561
6929#ifdef CONFIG_CGROUP_SCHED 7562#ifdef CONFIG_CGROUP_SCHED
6930 7563
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7564static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7258,15 +7891,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7258 return ret; 7891 return ret;
7259} 7892}
7260 7893
7261static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7894static int cpu_stats_show(struct seq_file *sf, void *v)
7262 struct cgroup_map_cb *cb)
7263{ 7895{
7264 struct task_group *tg = css_tg(css); 7896 struct task_group *tg = css_tg(seq_css(sf));
7265 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7897 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7266 7898
7267 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7899 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7268 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7900 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7269 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7901 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7270 7902
7271 return 0; 7903 return 0;
7272} 7904}
@@ -7320,7 +7952,7 @@ static struct cftype cpu_files[] = {
7320 }, 7952 },
7321 { 7953 {
7322 .name = "stat", 7954 .name = "stat",
7323 .read_map = cpu_stats_show, 7955 .seq_show = cpu_stats_show,
7324 }, 7956 },
7325#endif 7957#endif
7326#ifdef CONFIG_RT_GROUP_SCHED 7958#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..5b9bb42b2d47
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
1/*
2 * kernel/sched/cpudl.c
3 *
4 * Global CPU deadline management
5 *
6 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */
13
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include "cpudeadline.h"
17
18static inline int parent(int i)
19{
20 return (i - 1) >> 1;
21}
22
23static inline int left_child(int i)
24{
25 return (i << 1) + 1;
26}
27
28static inline int right_child(int i)
29{
30 return (i << 1) + 2;
31}
32
33static inline int dl_time_before(u64 a, u64 b)
34{
35 return (s64)(a - b) < 0;
36}
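/*
 * A standalone sketch, not from this patch, of why dl_time_before()
 * compares deadlines through a signed difference: the cast keeps the
 * ordering correct across a wrap of the u64 nanosecond clock, where a
 * plain "a < b" would give the wrong answer. Values are made up.
 */
#include <stdint.h>
#include <stdio.h>

static int dl_time_before_sketch(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 50;	/* deadline just before the wrap */
	uint64_t wrapped   = 10;		/* deadline just after the wrap  */

	/* near_wrap is "before" wrapped even though it is numerically larger */
	printf("%d\n", dl_time_before_sketch(near_wrap, wrapped));	/* 1 */
	printf("%d\n", near_wrap < wrapped);				/* 0 */
	return 0;
}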
37
38static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41
42 swap(cp->elements[a], cp->elements[b]);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
44}
45
46static void cpudl_heapify(struct cpudl *cp, int idx)
47{
48 int l, r, largest;
49
50 /* adapted from lib/prio_heap.c */
51 while (1) {
52 l = left_child(idx);
53 r = right_child(idx);
54 largest = idx;
55
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl))
58 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl))
61 largest = r;
62 if (largest == idx)
63 break;
64
65 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx);
67 idx = largest;
68 }
69}
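/*
 * A standalone sketch, not from this patch, of the same 0-based binary
 * heap layout and sift-down used by cpudl_heapify() above, on an array
 * of plain keys: the largest (latest) deadline bubbles to index 0.
 * Values are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define LEFT(i)		(((i) << 1) + 1)
#define RIGHT(i)	(((i) << 1) + 2)

static void sift_down(uint64_t *h, int size, int idx)
{
	for (;;) {
		int l = LEFT(idx), r = RIGHT(idx), largest = idx;

		if (l < size && h[l] > h[largest])
			largest = l;
		if (r < size && h[r] > h[largest])
			largest = r;
		if (largest == idx)
			break;

		/* push idx down one level, as cpudl_exchange() does */
		uint64_t tmp = h[idx];
		h[idx] = h[largest];
		h[largest] = tmp;
		idx = largest;
	}
}

int main(void)
{
	uint64_t h[] = { 10, 300, 200, 40, 50 };	/* root is out of place */

	sift_down(h, 5, 0);
	printf("%llu\n", (unsigned long long)h[0]);	/* prints 300 */
	return 0;
}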
70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{
73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx);
78 } else {
79 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx);
84 }
85 }
86}
87
88static inline int cpudl_maximum(struct cpudl *cp)
89{
90 return cp->elements[0].cpu;
91}
92
93/*
94 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context
96 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 *
99 * Returns: int - best CPU (heap maximum if suitable)
100 */
101int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask)
103{
104 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl;
106
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask);
111 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp);
115 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask);
117 }
118
119out:
120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121
122 return best_cpu;
123}
124
125/*
126 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu
130 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked
132 *
133 * Returns: (void)
134 */
135void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136{
137 int old_idx, new_cpu;
138 unsigned long flags;
139
140 WARN_ON(!cpu_present(cpu));
141
142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu];
144 if (!is_valid) {
145 /* remove item */
146 if (old_idx == IDX_INVALID) {
147 /*
148 * Nothing to remove if old_idx was invalid.
149 * This could happen if rq_offline_dl is
150 * called for a CPU without -dl tasks running.
151 */
152 goto out;
153 }
154 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx);
165 }
166 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx);
168
169 goto out;
170 }
171
172 if (old_idx == IDX_INVALID) {
173 cp->size++;
174 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else {
180 cpudl_change_key(cp, old_idx, dl);
181 }
182
183out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags);
185}
186
187/*
188 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context
190 */
191int cpudl_init(struct cpudl *cp)
192{
193 int i;
194
195 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock);
197 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++)
199 cp->cpu_to_idx[i] = IDX_INVALID;
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
201 return -ENOMEM;
202 cpumask_setall(cp->free_cpus);
203
204 return 0;
205}
206
207/*
208 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context
210 */
211void cpudl_cleanup(struct cpudl *cp)
212{
213 /*
214 * nothing to do for the moment
215 */
216}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_CPUDL_H
2#define _LINUX_CPUDL_H
3
4#include <linux/sched.h>
5
6#define IDX_INVALID -1
7
8struct array_item {
9 u64 dl;
10 int cpu;
11};
12
13struct cpudl {
14 raw_spinlock_t lock;
15 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus;
19};
20
21
22#ifdef CONFIG_SMP
23int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */
32
33#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..6e79b3faa4cd
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1639 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Juri Lelli <juri.lelli@gmail.com>,
14 * Michael Trimarchi <michael@amarulasolutions.com>,
15 * Fabio Checconi <fchecconi@gmail.com>
16 */
17#include "sched.h"
18
19#include <linux/slab.h>
20
21struct dl_bandwidth def_dl_bandwidth;
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
54{
55 raw_spin_lock_init(&dl_b->dl_runtime_lock);
56 dl_b->dl_period = period;
57 dl_b->dl_runtime = runtime;
58}
59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b)
63{
64 raw_spin_lock_init(&dl_b->lock);
65 raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
66 if (global_rt_runtime() == RUNTIME_INF)
67 dl_b->bw = -1;
68 else
69 dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
70 raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
71 dl_b->total_bw = 0;
72}
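/*
 * A standalone sketch, not from this patch, of the bandwidth bookkeeping
 * that init_dl_bw() above sets up. to_ratio() is only declared extern
 * here; the sketch assumes the usual fixed-point encoding with a 2^20
 * scale, and the admission test mirrors the new_bw/total_bw comparison
 * done in sched_dl_global_constraints(). Names and numbers are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20	/* assumed fixed-point scale for bandwidth ratios */

static uint64_t to_ratio_sketch(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* global limit: 950ms every 1s, mirroring the default rt sysctls */
	uint64_t bw = to_ratio_sketch(1000000000ULL, 950000000ULL);
	/* one -deadline reservation asking for 10ms every 100ms */
	uint64_t task_bw = to_ratio_sketch(100000000ULL, 10000000ULL);
	uint64_t total_bw = 0;

	if (total_bw + task_bw <= bw) {
		total_bw += task_bw;	/* admitted: roughly 10% of one CPU */
		printf("admitted: total_bw=%llu bw=%llu\n",
		       (unsigned long long)total_bw, (unsigned long long)bw);
	}
	return 0;
}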
73
74void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
75{
76 dl_rq->rb_root = RB_ROOT;
77
78#ifdef CONFIG_SMP
79 /* zero means no -deadline tasks */
80 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
81
82 dl_rq->dl_nr_migratory = 0;
83 dl_rq->overloaded = 0;
84 dl_rq->pushable_dl_tasks_root = RB_ROOT;
85#else
86 init_dl_bw(&dl_rq->dl_bw);
87#endif
88}
89
90#ifdef CONFIG_SMP
91
92static inline int dl_overloaded(struct rq *rq)
93{
94 return atomic_read(&rq->rd->dlo_count);
95}
96
97static inline void dl_set_overload(struct rq *rq)
98{
99 if (!rq->online)
100 return;
101
102 cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
103 /*
104 * Must be visible before the overload count is
105 * set (as in sched_rt.c).
106 *
107 * Matched by the barrier in pull_dl_task().
108 */
109 smp_wmb();
110 atomic_inc(&rq->rd->dlo_count);
111}
112
113static inline void dl_clear_overload(struct rq *rq)
114{
115 if (!rq->online)
116 return;
117
118 atomic_dec(&rq->rd->dlo_count);
119 cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
120}
121
122static void update_dl_migration(struct dl_rq *dl_rq)
123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1;
128 }
129 } else if (dl_rq->overloaded) {
130 dl_clear_overload(rq_of_dl_rq(dl_rq));
131 dl_rq->overloaded = 0;
132 }
133}
134
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{
137 struct task_struct *p = dl_task_of(dl_se);
138
139 if (p->nr_cpus_allowed > 1)
140 dl_rq->dl_nr_migratory++;
141
142 update_dl_migration(dl_rq);
143}
144
145static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
146{
147 struct task_struct *p = dl_task_of(dl_se);
148
149 if (p->nr_cpus_allowed > 1)
150 dl_rq->dl_nr_migratory--;
151
152 update_dl_migration(dl_rq);
153}
154
155/*
156 * The list of pushable -deadline task is not a plist, like in
157 * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
158 */
159static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
160{
161 struct dl_rq *dl_rq = &rq->dl;
162 struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
163 struct rb_node *parent = NULL;
164 struct task_struct *entry;
165 int leftmost = 1;
166
167 BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
168
169 while (*link) {
170 parent = *link;
171 entry = rb_entry(parent, struct task_struct,
172 pushable_dl_tasks);
173 if (dl_entity_preempt(&p->dl, &entry->dl))
174 link = &parent->rb_left;
175 else {
176 link = &parent->rb_right;
177 leftmost = 0;
178 }
179 }
180
181 if (leftmost)
182 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
183
184 rb_link_node(&p->pushable_dl_tasks, parent, link);
185 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
186}
187
188static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
189{
190 struct dl_rq *dl_rq = &rq->dl;
191
192 if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
193 return;
194
195 if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
196 struct rb_node *next_node;
197
198 next_node = rb_next(&p->pushable_dl_tasks);
199 dl_rq->pushable_dl_tasks_leftmost = next_node;
200 }
201
202 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
203 RB_CLEAR_NODE(&p->pushable_dl_tasks);
204}
205
206static inline int has_pushable_dl_tasks(struct rq *rq)
207{
208 return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
209}
210
211static int push_dl_task(struct rq *rq);
212
213#else
214
215static inline
216void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
217{
218}
219
220static inline
221void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
222{
223}
224
225static inline
226void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
227{
228}
229
230static inline
231void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
232{
233}
234
235#endif /* CONFIG_SMP */
236
237static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
238static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
239static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
240 int flags);
241
242/*
243 * We are being explicitly informed that a new instance is starting,
244 * and this means that:
245 * - the absolute deadline of the entity has to be placed at
246 * current time + relative deadline;
247 * - the runtime of the entity has to be set to the maximum value.
248 *
249 * The capability of specifying such an event is useful whenever a -deadline
250 * entity wants to (try to!) synchronize its behaviour with the scheduler's
251 * one, and to (try to!) reconcile itself with its own scheduling
252 * parameters.
253 */
254static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
255 struct sched_dl_entity *pi_se)
256{
257 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
258 struct rq *rq = rq_of_dl_rq(dl_rq);
259
260 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
261
262 /*
263 * We use the regular wall clock time to set deadlines in the
264 * future; in fact, we must consider execution overheads (time
265 * spent on hardirq context, etc.).
266 */
267 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
268 dl_se->runtime = pi_se->dl_runtime;
269 dl_se->dl_new = 0;
270}
271
272/*
273 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
274 * possibility of an entity lasting more than what it declared, and thus
275 * exhausting its runtime.
276 *
277 * Here we are interested in making runtime overrun possible, but we do
278 * not want an entity that is misbehaving to affect the scheduling of all
279 * other entities.
280 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
281 * is used, in order to confine each entity within its own bandwidth.
282 *
283 * This function deals exactly with that, and ensures that when the runtime
284 * of an entity is replenished, its deadline is also postponed. That ensures
285 * the overrunning entity can't interfere with other entities in the system and
286 * can't make them miss their deadlines. Reasons why this kind of overrun
287 * could happen are, typically, an entity voluntarily trying to exceed its
288 * runtime, or just having underestimated it during sched_setscheduler_ex().
289 */
290static void replenish_dl_entity(struct sched_dl_entity *dl_se,
291 struct sched_dl_entity *pi_se)
292{
293 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
294 struct rq *rq = rq_of_dl_rq(dl_rq);
295
296 BUG_ON(pi_se->dl_runtime <= 0);
297
298 /*
299 * This could be the case for a !-dl task that is boosted.
300 * Just go with full inherited parameters.
301 */
302 if (dl_se->dl_deadline == 0) {
303 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
304 dl_se->runtime = pi_se->dl_runtime;
305 }
306
307 /*
308 * We keep moving the deadline away until we get some
309 * available runtime for the entity. This ensures correct
310 * handling of situations where the runtime overrun is
311 * arbitrary large.
312 * arbitrarily large.
313 while (dl_se->runtime <= 0) {
314 dl_se->deadline += pi_se->dl_period;
315 dl_se->runtime += pi_se->dl_runtime;
316 }
317
318 /*
319 * At this point, the deadline really should be "in
320 * the future" with respect to rq->clock. If it's
321 * not, we are, for some reason, lagging too much!
322 * Anyway, after having warned userspace about that,
323 * we still try to keep things running by
324 * resetting the deadline and the budget of the
325 * entity.
326 */
327 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
328 static bool lag_once = false;
329
330 if (!lag_once) {
331 lag_once = true;
332 printk_sched("sched: DL replenish lagged too much\n");
333 }
334 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
335 dl_se->runtime = pi_se->dl_runtime;
336 }
337}
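/*
 * A standalone sketch, not from this patch, of the replenishment loop
 * above with made-up numbers: an entity that overran by 7ms, with a 3ms
 * runtime and a 10ms period, gets its deadline pushed three periods
 * ahead before its budget is positive again.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  runtime     = -7000000;	/* 7ms of overrun to pay back */
	uint64_t deadline    = 100000000;	/* current absolute deadline  */
	const int64_t  dl_runtime = 3000000;	/* 3ms budget per period      */
	const uint64_t dl_period  = 10000000;	/* 10ms period                */

	while (runtime <= 0) {
		deadline += dl_period;
		runtime  += dl_runtime;
	}

	/* prints runtime=2000000 deadline=130000000: three periods later */
	printf("runtime=%lld deadline=%llu\n",
	       (long long)runtime, (unsigned long long)deadline);
	return 0;
}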
338
339/*
340 * Here we check if --at time t-- an entity (which is probably being
341 * [re]activated or, in general, enqueued) can use its remaining runtime
342 * and its current deadline _without_ exceeding the bandwidth it is
343 * assigned (function returns true if it can't). We are in fact applying
344 * one of the CBS rules: when a task wakes up, if the residual runtime
345 * over residual deadline fits within the allocated bandwidth, then we
346 * can keep the current (absolute) deadline and residual budget without
347 * disrupting the schedulability of the system. Otherwise, we should
348 * refill the runtime and set the deadline a period in the future,
349 * because keeping the current (absolute) deadline of the task would
350 * result in breaking guarantees promised to other tasks (refer to
351 * Documentation/scheduler/sched-deadline.txt for more information).
352 *
353 * This function returns true if:
354 *
355 * runtime / (deadline - t) > dl_runtime / dl_period ,
356 *
357 * IOW we can't recycle current parameters.
358 *
359 * Notice that the bandwidth check is done against the period. For
360 * tasks with deadline equal to period this is the same as using
361 * dl_deadline instead of dl_period in the equation above.
362 */
363static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
364 struct sched_dl_entity *pi_se, u64 t)
365{
366 u64 left, right;
367
368 /*
369 * left and right are the two sides of the equation above,
370 * after a bit of shuffling to use multiplications instead
371 * of divisions.
372 *
373 * Note that none of the time values involved in the two
374 * multiplications are absolute: dl_deadline and dl_runtime
375 * are the relative deadline and the maximum runtime of each
376 * instance, runtime is the runtime left for the last instance
377 * and (deadline - t), since t is rq->clock, is the time left
378 * to the (absolute) deadline. Even if overflowing the u64 type
379 * is very unlikely to occur in both cases, here we scale down
380 * as we want to avoid that risk at all. Scaling down by 10
381 * means that we reduce granularity to 1us. We are fine with it,
382 * since this is only a true/false check and, anyway, thinking
383 * of anything below microsecond resolution is actually fiction
384 * (but still we want to give the user that illusion >;).
385 */
386 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
387 right = ((dl_se->deadline - t) >> DL_SCALE) *
388 (pi_se->dl_runtime >> DL_SCALE);
389
390 return dl_time_before(right, left);
391}
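/*
 * A standalone re-run, not from this patch, of the scaled comparison
 * above with concrete numbers. DL_SCALE is assumed to be 10, matching
 * the "reduce granularity to 1us" comment. A 10ms/100ms reservation
 * that wakes up with 5ms of runtime left but only 20ms to its deadline
 * would need 25% of the CPU to finish, so the check reports an overflow
 * and the parameters get refreshed.
 */
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumed scaling factor (~1us granularity) */

int main(void)
{
	uint64_t dl_period  = 100000000;	/* 100ms relative period    */
	uint64_t dl_runtime =  10000000;	/*  10ms runtime per period */
	uint64_t runtime    =   5000000;	/* 5ms left in the budget   */
	uint64_t time_to_dl =  20000000;	/* deadline - t = 20ms      */

	uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
	uint64_t right = (time_to_dl >> DL_SCALE) * (dl_runtime >> DL_SCALE);

	/* right < left  <=>  runtime / (deadline - t) > dl_runtime / dl_period */
	printf("overflow=%d\n", right < left);	/* prints overflow=1 */
	return 0;
}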
392
393/*
394 * When a -deadline entity is queued back on the runqueue, its runtime and
395 * deadline might need updating.
396 *
397 * The policy here is that we update the deadline of the entity only if:
398 * - the current deadline is in the past,
399 * - using the remaining runtime with the current deadline would make
400 * the entity exceed its bandwidth.
401 */
402static void update_dl_entity(struct sched_dl_entity *dl_se,
403 struct sched_dl_entity *pi_se)
404{
405 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
406 struct rq *rq = rq_of_dl_rq(dl_rq);
407
408 /*
409 * The arrival of a new instance needs special treatment, i.e.,
410 * the actual scheduling parameters have to be "renewed".
411 */
412 if (dl_se->dl_new) {
413 setup_new_dl_entity(dl_se, pi_se);
414 return;
415 }
416
417 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
418 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
419 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
420 dl_se->runtime = pi_se->dl_runtime;
421 }
422}
423
424/*
425 * If the entity depleted all its runtime, and if we want it to sleep
426 * while waiting for some new execution time to become available, we
427 * set the bandwidth enforcement timer to the replenishment instant
428 * and try to activate it.
429 *
430 * Notice that it is important for the caller to know if the timer
431 * actually started or not (i.e., the replenishment instant is in
432 * the future or in the past).
433 */
434static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
435{
436 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
437 struct rq *rq = rq_of_dl_rq(dl_rq);
438 ktime_t now, act;
439 ktime_t soft, hard;
440 unsigned long range;
441 s64 delta;
442
443 if (boosted)
444 return 0;
445 /*
446 * We want the timer to fire at the deadline, but keep in mind
447 * that the deadline is expressed in rq->clock time and not in
448 * the hrtimer's time base.
449 */
450 act = ns_to_ktime(dl_se->deadline);
451 now = hrtimer_cb_get_time(&dl_se->dl_timer);
452 delta = ktime_to_ns(now) - rq_clock(rq);
453 act = ktime_add_ns(act, delta);
454
455 /*
456 * If the expiry time already passed, e.g., because the value
457 * chosen as the deadline is too small, don't even try to
458 * start the timer in the past!
459 */
460 if (ktime_us_delta(act, now) < 0)
461 return 0;
462
463 hrtimer_set_expires(&dl_se->dl_timer, act);
464
465 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
466 hard = hrtimer_get_expires(&dl_se->dl_timer);
467 range = ktime_to_ns(ktime_sub(hard, soft));
468 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
469 range, HRTIMER_MODE_ABS, 0);
470
471 return hrtimer_active(&dl_se->dl_timer);
472}
473
474/*
475 * This is the bandwidth enforcement timer callback. If here, we know
476 * a task is not on its dl_rq, since the fact that the timer was running
477 * means the task is throttled and needs a runtime replenishment.
478 *
479 * However, what we actually do depends on whether the task is still
480 * active (i.e. it is on its rq) or has been removed from there by a call to
481 * dequeue_task_dl(). In the former case we must issue the runtime
482 * replenishment and add the task back to the dl_rq; in the latter, we just
483 * do nothing but clear dl_throttled, so that runtime and deadline
484 * updating (and the queueing back to dl_rq) will be done by the
485 * next call to enqueue_task_dl().
486 */
487static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
488{
489 struct sched_dl_entity *dl_se = container_of(timer,
490 struct sched_dl_entity,
491 dl_timer);
492 struct task_struct *p = dl_task_of(dl_se);
493 struct rq *rq = task_rq(p);
494 raw_spin_lock(&rq->lock);
495
496 /*
497 * We need to take care of possible races here. In fact, the
498 * task might have changed its scheduling policy to something
499 * different from SCHED_DEADLINE or changed its reservation
500 * parameters (through sched_setscheduler()).
501 */
502 if (!dl_task(p) || dl_se->dl_new)
503 goto unlock;
504
505 sched_clock_tick();
506 update_rq_clock(rq);
507 dl_se->dl_throttled = 0;
508 if (p->on_rq) {
509 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
510 if (task_has_dl_policy(rq->curr))
511 check_preempt_curr_dl(rq, p, 0);
512 else
513 resched_task(rq->curr);
514#ifdef CONFIG_SMP
515 /*
516 * Queueing this task back might have overloaded rq,
517 * check if we need to kick someone away.
518 */
519 if (has_pushable_dl_tasks(rq))
520 push_dl_task(rq);
521#endif
522 }
523unlock:
524 raw_spin_unlock(&rq->lock);
525
526 return HRTIMER_NORESTART;
527}
528
529void init_dl_task_timer(struct sched_dl_entity *dl_se)
530{
531 struct hrtimer *timer = &dl_se->dl_timer;
532
533 if (hrtimer_active(timer)) {
534 hrtimer_try_to_cancel(timer);
535 return;
536 }
537
538 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
539 timer->function = dl_task_timer;
540}
541
542static
543int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
544{
545 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
546 int rorun = dl_se->runtime <= 0;
547
548 if (!rorun && !dmiss)
549 return 0;
550
551 /*
552 * If we are beyond our current deadline and we are still
553 * executing, then we have already used some of the runtime of
554 * the next instance. Thus, if we do not account that, we are
555 * stealing bandwidth from the system at each deadline miss!
556 */
557 if (dmiss) {
558 dl_se->runtime = rorun ? dl_se->runtime : 0;
559 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
560 }
561
562 return 1;
563}
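/*
 * A standalone sketch, not from this patch, of the deadline-miss
 * accounting above: if the deadline was missed while runtime was still
 * nominally available, the stale budget is dropped and the time run past
 * the deadline is charged as negative runtime, which the replenishment
 * loop later pays back by pushing the next deadline further out.
 * Numbers are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  runtime  = 1000000;	/* 1ms of budget nominally left */
	uint64_t deadline = 50000000;	/* absolute deadline at 50ms    */
	uint64_t now      = 54000000;	/* we are 4ms past the deadline */

	int dmiss = (int64_t)(deadline - now) < 0;	/* deadline missed */
	int rorun = runtime <= 0;			/* runtime overrun */

	if (dmiss) {
		runtime  = rorun ? runtime : 0;		/* drop the stale budget */
		runtime -= (int64_t)(now - deadline);	/* charge the miss       */
	}

	/* prints runtime=-4000000 exceeded=1 */
	printf("runtime=%lld exceeded=%d\n", (long long)runtime, rorun || dmiss);
	return 0;
}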
564
565extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
566
567/*
568 * Update the current task's runtime statistics (provided it is still
569 * a -deadline task and has not been removed from the dl_rq).
570 */
571static void update_curr_dl(struct rq *rq)
572{
573 struct task_struct *curr = rq->curr;
574 struct sched_dl_entity *dl_se = &curr->dl;
575 u64 delta_exec;
576
577 if (!dl_task(curr) || !on_dl_rq(dl_se))
578 return;
579
580 /*
581 * Consumed budget is computed considering the time as
582 * observed by schedulable tasks (excluding time spent
583 * in hardirq context, etc.). Deadlines are instead
584 * computed using hard walltime. This seems to be the more
585 * natural solution, but the full ramifications of this
586 * approach need further study.
587 */
588 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
589 if (unlikely((s64)delta_exec < 0))
590 delta_exec = 0;
591
592 schedstat_set(curr->se.statistics.exec_max,
593 max(curr->se.statistics.exec_max, delta_exec));
594
595 curr->se.sum_exec_runtime += delta_exec;
596 account_group_exec_runtime(curr, delta_exec);
597
598 curr->se.exec_start = rq_clock_task(rq);
599 cpuacct_charge(curr, delta_exec);
600
601 sched_rt_avg_update(rq, delta_exec);
602
603 dl_se->runtime -= delta_exec;
604 if (dl_runtime_exceeded(rq, dl_se)) {
605 __dequeue_task_dl(rq, curr, 0);
606 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
607 dl_se->dl_throttled = 1;
608 else
609 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
610
611 if (!is_leftmost(curr, &rq->dl))
612 resched_task(curr);
613 }
614
615 /*
616 * Because -- for now -- we share the rt bandwidth, we need to
617 * account our runtime there too, otherwise actual rt tasks
618 * would be able to exceed the shared quota.
619 *
620 * Account to the root rt group for now.
621 *
622 * The solution we're working towards is having the RT groups scheduled
623 * using deadline servers -- however there's a few nasties to figure
624 * out before that can happen.
625 */
626 if (rt_bandwidth_enabled()) {
627 struct rt_rq *rt_rq = &rq->rt;
628
629 raw_spin_lock(&rt_rq->rt_runtime_lock);
630 /*
631 * We'll let actual RT tasks worry about the overflow here, we
632 * have our own CBS to keep us in line; only account when RT
633 * bandwidth is relevant.
634 */
635 if (sched_rt_bandwidth_account(rt_rq))
636 rt_rq->rt_time += delta_exec;
637 raw_spin_unlock(&rt_rq->rt_runtime_lock);
638 }
639}
640
641#ifdef CONFIG_SMP
642
643static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
644
645static inline u64 next_deadline(struct rq *rq)
646{
647 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
648
649 if (next && dl_prio(next->prio))
650 return next->dl.deadline;
651 else
652 return 0;
653}
654
655static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
656{
657 struct rq *rq = rq_of_dl_rq(dl_rq);
658
659 if (dl_rq->earliest_dl.curr == 0 ||
660 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
661 /*
662 * If the dl_rq had no -deadline tasks, or if the new task
663 * has shorter deadline than the current one on dl_rq, we
664 * know that the previous earliest becomes our next earliest,
665 * as the new task becomes the earliest itself.
666 */
667 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
668 dl_rq->earliest_dl.curr = deadline;
669 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
670 } else if (dl_rq->earliest_dl.next == 0 ||
671 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
672 /*
673 * On the other hand, if the new -deadline task has a
674 * later deadline than the earliest one on dl_rq, but
675 * it is earlier than the next (if any), we must
676 * recompute the next-earliest.
677 */
678 dl_rq->earliest_dl.next = next_deadline(rq);
679 }
680}
681
682static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
683{
684 struct rq *rq = rq_of_dl_rq(dl_rq);
685
686 /*
687 * Since we may have removed our earliest (and/or next earliest)
688 * task we must recompute them.
689 */
690 if (!dl_rq->dl_nr_running) {
691 dl_rq->earliest_dl.curr = 0;
692 dl_rq->earliest_dl.next = 0;
693 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
694 } else {
695 struct rb_node *leftmost = dl_rq->rb_leftmost;
696 struct sched_dl_entity *entry;
697
698 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
699 dl_rq->earliest_dl.curr = entry->deadline;
700 dl_rq->earliest_dl.next = next_deadline(rq);
701 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
702 }
703}
704
705#else
706
707static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
708static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
709
710#endif /* CONFIG_SMP */
711
712static inline
713void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
714{
715 int prio = dl_task_of(dl_se)->prio;
716 u64 deadline = dl_se->deadline;
717
718 WARN_ON(!dl_prio(prio));
719 dl_rq->dl_nr_running++;
720 inc_nr_running(rq_of_dl_rq(dl_rq));
721
722 inc_dl_deadline(dl_rq, deadline);
723 inc_dl_migration(dl_se, dl_rq);
724}
725
726static inline
727void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
728{
729 int prio = dl_task_of(dl_se)->prio;
730
731 WARN_ON(!dl_prio(prio));
732 WARN_ON(!dl_rq->dl_nr_running);
733 dl_rq->dl_nr_running--;
734 dec_nr_running(rq_of_dl_rq(dl_rq));
735
736 dec_dl_deadline(dl_rq, dl_se->deadline);
737 dec_dl_migration(dl_se, dl_rq);
738}
739
740static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
741{
742 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
743 struct rb_node **link = &dl_rq->rb_root.rb_node;
744 struct rb_node *parent = NULL;
745 struct sched_dl_entity *entry;
746 int leftmost = 1;
747
748 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
749
750 while (*link) {
751 parent = *link;
752 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
753 if (dl_time_before(dl_se->deadline, entry->deadline))
754 link = &parent->rb_left;
755 else {
756 link = &parent->rb_right;
757 leftmost = 0;
758 }
759 }
760
761 if (leftmost)
762 dl_rq->rb_leftmost = &dl_se->rb_node;
763
764 rb_link_node(&dl_se->rb_node, parent, link);
765 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
766
767 inc_dl_tasks(dl_se, dl_rq);
768}
769
770static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
771{
772 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
773
774 if (RB_EMPTY_NODE(&dl_se->rb_node))
775 return;
776
777 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
778 struct rb_node *next_node;
779
780 next_node = rb_next(&dl_se->rb_node);
781 dl_rq->rb_leftmost = next_node;
782 }
783
784 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
785 RB_CLEAR_NODE(&dl_se->rb_node);
786
787 dec_dl_tasks(dl_se, dl_rq);
788}
789
790static void
791enqueue_dl_entity(struct sched_dl_entity *dl_se,
792 struct sched_dl_entity *pi_se, int flags)
793{
794 BUG_ON(on_dl_rq(dl_se));
795
796 /*
797 * If this is a wakeup or a new instance, the scheduling
798 * parameters of the task might need updating. Otherwise,
799 * we want a replenishment of its runtime.
800 */
801 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
802 replenish_dl_entity(dl_se, pi_se);
803 else
804 update_dl_entity(dl_se, pi_se);
805
806 __enqueue_dl_entity(dl_se);
807}
808
809static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
810{
811 __dequeue_dl_entity(dl_se);
812}
813
814static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
815{
816 struct task_struct *pi_task = rt_mutex_get_top_task(p);
817 struct sched_dl_entity *pi_se = &p->dl;
818
819 /*
820 * Use the scheduling parameters of the top pi-waiter
821 * task if we have one and its (relative) deadline is
822 * smaller than ours... otherwise we keep our runtime and
823 * deadline.
824 */
825 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
826 pi_se = &pi_task->dl;
827
828 /*
829 * If p is throttled, we do nothing. In fact, if it exhausted
830 * its budget it needs a replenishment and, since it now is on
831 * its rq, the bandwidth timer callback (which clearly has not
832 * run yet) will take care of this.
833 */
834 if (p->dl.dl_throttled)
835 return;
836
837 enqueue_dl_entity(&p->dl, pi_se, flags);
838
839 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
840 enqueue_pushable_dl_task(rq, p);
841}
842
843static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
844{
845 dequeue_dl_entity(&p->dl);
846 dequeue_pushable_dl_task(rq, p);
847}
848
849static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
850{
851 update_curr_dl(rq);
852 __dequeue_task_dl(rq, p, flags);
853}
854
855/*
856 * Yield task semantic for -deadline tasks is:
857 *
858 * get off the CPU until our next instance, with
859 * a new runtime. This is of little use now, since we
860 * don't have a bandwidth reclaiming mechanism. Anyway,
861 * bandwidth reclaiming is planned for the future, and
862 * yield_task_dl will indicate that some spare budget
863 * is available for other task instances to use.
864 */
865static void yield_task_dl(struct rq *rq)
866{
867 struct task_struct *p = rq->curr;
868
869 /*
870 * We make the task go to sleep until its current deadline by
871 * forcing its runtime to zero. This way, update_curr_dl() stops
872 * it and the bandwidth timer will wake it up and will give it
873 * new scheduling parameters (thanks to dl_new=1).
874 */
875 if (p->dl.runtime > 0) {
876 rq->curr->dl.dl_new = 1;
877 p->dl.runtime = 0;
878 }
879 update_curr_dl(rq);
880}
881
882#ifdef CONFIG_SMP
883
884static int find_later_rq(struct task_struct *task);
885
886static int
887select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
888{
889 struct task_struct *curr;
890 struct rq *rq;
891
892 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
893 goto out;
894
895 rq = cpu_rq(cpu);
896
897 rcu_read_lock();
898 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
899
900 /*
901 * If we are dealing with a -deadline task, we must
902 * decide where to wake it up.
903 * If it has a later deadline and the current task
904 * on this rq can't move (provided the waking task
905 * can!) we prefer to send it somewhere else. On the
906 * other hand, if it has a shorter deadline, we
907 * try to make it stay here, it might be important.
908 */
909 if (unlikely(dl_task(curr)) &&
910 (curr->nr_cpus_allowed < 2 ||
911 !dl_entity_preempt(&p->dl, &curr->dl)) &&
912 (p->nr_cpus_allowed > 1)) {
913 int target = find_later_rq(p);
914
915 if (target != -1)
916 cpu = target;
917 }
918 rcu_read_unlock();
919
920out:
921 return cpu;
922}
923
924static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
925{
926 /*
927 * Current can't be migrated, useless to reschedule,
928 * let's hope p can move out.
929 */
930 if (rq->curr->nr_cpus_allowed == 1 ||
931 cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
932 return;
933
934 /*
935 * p is migratable, so let's not schedule it and
936 * see if it is pushed or pulled somewhere else.
937 */
938 if (p->nr_cpus_allowed != 1 &&
939 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
940 return;
941
942 resched_task(rq->curr);
943}
944
945#endif /* CONFIG_SMP */
946
947/*
948 * Only called when both the current and waking task are -deadline
949 * tasks.
950 */
951static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
952 int flags)
953{
954 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
955 resched_task(rq->curr);
956 return;
957 }
958
959#ifdef CONFIG_SMP
960 /*
961 * In the unlikely case current and p have the same deadline
962 * let us try to decide what's the best thing to do...
963 */
964 if ((p->dl.deadline == rq->curr->dl.deadline) &&
965 !test_tsk_need_resched(rq->curr))
966 check_preempt_equal_dl(rq, p);
967#endif /* CONFIG_SMP */
968}
969
970#ifdef CONFIG_SCHED_HRTICK
971static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
972{
973 s64 delta = p->dl.dl_runtime - p->dl.runtime;
974
975 if (delta > 10000)
976 hrtick_start(rq, p->dl.runtime);
977}
978#endif
979
980static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
981 struct dl_rq *dl_rq)
982{
983 struct rb_node *left = dl_rq->rb_leftmost;
984
985 if (!left)
986 return NULL;
987
988 return rb_entry(left, struct sched_dl_entity, rb_node);
989}
990
991struct task_struct *pick_next_task_dl(struct rq *rq)
992{
993 struct sched_dl_entity *dl_se;
994 struct task_struct *p;
995 struct dl_rq *dl_rq;
996
997 dl_rq = &rq->dl;
998
999 if (unlikely(!dl_rq->dl_nr_running))
1000 return NULL;
1001
1002 dl_se = pick_next_dl_entity(rq, dl_rq);
1003 BUG_ON(!dl_se);
1004
1005 p = dl_task_of(dl_se);
1006 p->se.exec_start = rq_clock_task(rq);
1007
1008 /* Running task will never be pushed. */
1009 dequeue_pushable_dl_task(rq, p);
1010
1011#ifdef CONFIG_SCHED_HRTICK
1012 if (hrtick_enabled(rq))
1013 start_hrtick_dl(rq, p);
1014#endif
1015
1016#ifdef CONFIG_SMP
1017 rq->post_schedule = has_pushable_dl_tasks(rq);
1018#endif /* CONFIG_SMP */
1019
1020 return p;
1021}
1022
1023static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1024{
1025 update_curr_dl(rq);
1026
1027 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1028 enqueue_pushable_dl_task(rq, p);
1029}
1030
1031static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1032{
1033 update_curr_dl(rq);
1034
1035#ifdef CONFIG_SCHED_HRTICK
1036 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1037 start_hrtick_dl(rq, p);
1038#endif
1039}
1040
1041static void task_fork_dl(struct task_struct *p)
1042{
1043 /*
1044 * SCHED_DEADLINE tasks cannot fork and this is achieved through
1045 * sched_fork()
1046 */
1047}
1048
1049static void task_dead_dl(struct task_struct *p)
1050{
1051 struct hrtimer *timer = &p->dl.dl_timer;
1052 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1053
1054 /*
1055 * Since we are TASK_DEAD we won't slip out of the domain!
1056 */
1057 raw_spin_lock_irq(&dl_b->lock);
1058 dl_b->total_bw -= p->dl.dl_bw;
1059 raw_spin_unlock_irq(&dl_b->lock);
1060
1061 hrtimer_cancel(timer);
1062}
1063
1064static void set_curr_task_dl(struct rq *rq)
1065{
1066 struct task_struct *p = rq->curr;
1067
1068 p->se.exec_start = rq_clock_task(rq);
1069
1070 /* You can't push away the running task */
1071 dequeue_pushable_dl_task(rq, p);
1072}
1073
1074#ifdef CONFIG_SMP
1075
1076/* Only try algorithms three times */
1077#define DL_MAX_TRIES 3
1078
1079static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1080{
1081 if (!task_running(rq, p) &&
1082 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1083 (p->nr_cpus_allowed > 1))
1084 return 1;
1085
1086 return 0;
1087}
1088
1089/* Returns the second earliest -deadline task, NULL otherwise */
1090static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1091{
1092 struct rb_node *next_node = rq->dl.rb_leftmost;
1093 struct sched_dl_entity *dl_se;
1094 struct task_struct *p = NULL;
1095
1096next_node:
1097 next_node = rb_next(next_node);
1098 if (next_node) {
1099 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1100 p = dl_task_of(dl_se);
1101
1102 if (pick_dl_task(rq, p, cpu))
1103 return p;
1104
1105 goto next_node;
1106 }
1107
1108 return NULL;
1109}
1110
1111static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1112
1113static int find_later_rq(struct task_struct *task)
1114{
1115 struct sched_domain *sd;
1116 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
1117 int this_cpu = smp_processor_id();
1118 int best_cpu, cpu = task_cpu(task);
1119
1120 /* Make sure the mask is initialized first */
1121 if (unlikely(!later_mask))
1122 return -1;
1123
1124 if (task->nr_cpus_allowed == 1)
1125 return -1;
1126
1127 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1128 task, later_mask);
1129 if (best_cpu == -1)
1130 return -1;
1131
1132 /*
1133 * If we are here, some target has been found,
1134 * the most suitable of which is cached in best_cpu.
1135 * This is, among the runqueues whose current tasks
1136 * have a later deadline than our task's, the rq
1137 * with the latest possible one.
1138 *
1139 * Now we check how well this matches with task's
1140 * affinity and system topology.
1141 *
1142 * The last cpu where the task ran is our first
1143 * guess, since it is most likely cache-hot there.
1144 */
1145 if (cpumask_test_cpu(cpu, later_mask))
1146 return cpu;
1147 /*
1148 * Check if this_cpu is to be skipped (i.e., it is
1149 * not in the mask) or not.
1150 */
1151 if (!cpumask_test_cpu(this_cpu, later_mask))
1152 this_cpu = -1;
1153
1154 rcu_read_lock();
1155 for_each_domain(cpu, sd) {
1156 if (sd->flags & SD_WAKE_AFFINE) {
1157
1158 /*
1159 * If possible, preempting this_cpu is
1160 * cheaper than migrating.
1161 */
1162 if (this_cpu != -1 &&
1163 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1164 rcu_read_unlock();
1165 return this_cpu;
1166 }
1167
1168 /*
1169 * Last chance: if best_cpu is valid and is
1170 * in the mask, that becomes our choice.
1171 */
1172 if (best_cpu < nr_cpu_ids &&
1173 cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
1174 rcu_read_unlock();
1175 return best_cpu;
1176 }
1177 }
1178 }
1179 rcu_read_unlock();
1180
1181 /*
1182 * At this point, all our guesses failed, we just return
1183 * 'something', and let the caller sort the things out.
1184 * 'something', and let the caller sort things out.
1185 if (this_cpu != -1)
1186 return this_cpu;
1187
1188 cpu = cpumask_any(later_mask);
1189 if (cpu < nr_cpu_ids)
1190 return cpu;
1191
1192 return -1;
1193}
1194
1195/* Locks the rq it finds */
1196static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1197{
1198 struct rq *later_rq = NULL;
1199 int tries;
1200 int cpu;
1201
1202 for (tries = 0; tries < DL_MAX_TRIES; tries++) {
1203 cpu = find_later_rq(task);
1204
1205 if ((cpu == -1) || (cpu == rq->cpu))
1206 break;
1207
1208 later_rq = cpu_rq(cpu);
1209
1210 /* Retry if something changed. */
1211 if (double_lock_balance(rq, later_rq)) {
1212 if (unlikely(task_rq(task) != rq ||
1213 !cpumask_test_cpu(later_rq->cpu,
1214 &task->cpus_allowed) ||
1215 task_running(rq, task) || !task->on_rq)) {
1216 double_unlock_balance(rq, later_rq);
1217 later_rq = NULL;
1218 break;
1219 }
1220 }
1221
1222 /*
1223 * If the rq we found has no -deadline task, or
1224 * its earliest one has a later deadline than our
1225 * task, the rq is a good one.
1226 */
1227 if (!later_rq->dl.dl_nr_running ||
1228 dl_time_before(task->dl.deadline,
1229 later_rq->dl.earliest_dl.curr))
1230 break;
1231
1232 /* Otherwise we try again. */
1233 double_unlock_balance(rq, later_rq);
1234 later_rq = NULL;
1235 }
1236
1237 return later_rq;
1238}
1239
1240static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1241{
1242 struct task_struct *p;
1243
1244 if (!has_pushable_dl_tasks(rq))
1245 return NULL;
1246
1247 p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
1248 struct task_struct, pushable_dl_tasks);
1249
1250 BUG_ON(rq->cpu != task_cpu(p));
1251 BUG_ON(task_current(rq, p));
1252 BUG_ON(p->nr_cpus_allowed <= 1);
1253
1254 BUG_ON(!p->on_rq);
1255 BUG_ON(!dl_task(p));
1256
1257 return p;
1258}
1259
1260/*
1261 * See if the non-running -deadline tasks on this rq
1262 * can be sent to some other CPU where they can preempt
1263 * and start executing.
1264 */
1265static int push_dl_task(struct rq *rq)
1266{
1267 struct task_struct *next_task;
1268 struct rq *later_rq;
1269
1270 if (!rq->dl.overloaded)
1271 return 0;
1272
1273 next_task = pick_next_pushable_dl_task(rq);
1274 if (!next_task)
1275 return 0;
1276
1277retry:
1278 if (unlikely(next_task == rq->curr)) {
1279 WARN_ON(1);
1280 return 0;
1281 }
1282
1283 /*
1284 * If next_task preempts rq->curr, and rq->curr
1285 * can move away, it makes sense to just reschedule
1286 * without going further in pushing next_task.
1287 */
1288 if (dl_task(rq->curr) &&
1289 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1290 rq->curr->nr_cpus_allowed > 1) {
1291 resched_task(rq->curr);
1292 return 0;
1293 }
1294
1295 /* We might release rq lock */
1296 get_task_struct(next_task);
1297
1298 /* Will lock the rq it'll find */
1299 later_rq = find_lock_later_rq(next_task, rq);
1300 if (!later_rq) {
1301 struct task_struct *task;
1302
1303 /*
1304 * We must check all this again, since
1305 * find_lock_later_rq releases rq->lock and it is
1306 * then possible that next_task has migrated.
1307 */
1308 task = pick_next_pushable_dl_task(rq);
1309 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1310 /*
1311 * The task is still there. We don't try
1312 * again, some other cpu will pull it when ready.
1313 */
1314 dequeue_pushable_dl_task(rq, next_task);
1315 goto out;
1316 }
1317
1318 if (!task)
1319 /* No more tasks */
1320 goto out;
1321
1322 put_task_struct(next_task);
1323 next_task = task;
1324 goto retry;
1325 }
1326
1327 deactivate_task(rq, next_task, 0);
1328 set_task_cpu(next_task, later_rq->cpu);
1329 activate_task(later_rq, next_task, 0);
1330
1331 resched_task(later_rq->curr);
1332
1333 double_unlock_balance(rq, later_rq);
1334
1335out:
1336 put_task_struct(next_task);
1337
1338 return 1;
1339}
1340
1341static void push_dl_tasks(struct rq *rq)
1342{
1343 /* Terminates as it moves a -deadline task */
1344 while (push_dl_task(rq))
1345 ;
1346}
1347
1348static int pull_dl_task(struct rq *this_rq)
1349{
1350 int this_cpu = this_rq->cpu, ret = 0, cpu;
1351 struct task_struct *p;
1352 struct rq *src_rq;
1353 u64 dmin = LONG_MAX;
1354
1355 if (likely(!dl_overloaded(this_rq)))
1356 return 0;
1357
1358 /*
1359 * Match the barrier from dl_set_overload(); this guarantees that if we
1360 * see overloaded we must also see the dlo_mask bit.
1361 */
1362 smp_rmb();
1363
1364 for_each_cpu(cpu, this_rq->rd->dlo_mask) {
1365 if (this_cpu == cpu)
1366 continue;
1367
1368 src_rq = cpu_rq(cpu);
1369
1370 /*
1371 * It looks racy, and it is! However, as in sched_rt.c,
1372 * we are fine with this.
1373 */
1374 if (this_rq->dl.dl_nr_running &&
1375 dl_time_before(this_rq->dl.earliest_dl.curr,
1376 src_rq->dl.earliest_dl.next))
1377 continue;
1378
1379 /* Might drop this_rq->lock */
1380 double_lock_balance(this_rq, src_rq);
1381
1382 /*
1383 * If there are no more pullable tasks on the
1384 * rq, we're done with it.
1385 */
1386 if (src_rq->dl.dl_nr_running <= 1)
1387 goto skip;
1388
1389 p = pick_next_earliest_dl_task(src_rq, this_cpu);
1390
1391 /*
1392 * We found a task to be pulled if:
1393 * - it preempts our current (if there's one),
1394 * - it will preempt the last one we pulled (if any).
1395 */
1396 if (p && dl_time_before(p->dl.deadline, dmin) &&
1397 (!this_rq->dl.dl_nr_running ||
1398 dl_time_before(p->dl.deadline,
1399 this_rq->dl.earliest_dl.curr))) {
1400 WARN_ON(p == src_rq->curr);
1401 WARN_ON(!p->on_rq);
1402
1403 /*
1404 * Then we pull iff p has actually an earlier
1405 * deadline than the current task of its runqueue.
1406 */
1407 if (dl_time_before(p->dl.deadline,
1408 src_rq->curr->dl.deadline))
1409 goto skip;
1410
1411 ret = 1;
1412
1413 deactivate_task(src_rq, p, 0);
1414 set_task_cpu(p, this_cpu);
1415 activate_task(this_rq, p, 0);
1416 dmin = p->dl.deadline;
1417
1418 /* Is there any other task even earlier? */
1419 }
1420skip:
1421 double_unlock_balance(this_rq, src_rq);
1422 }
1423
1424 return ret;
1425}
1426
1427static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1428{
1429 /* Try to pull other tasks here */
1430 if (dl_task(prev))
1431 pull_dl_task(rq);
1432}
1433
1434static void post_schedule_dl(struct rq *rq)
1435{
1436 push_dl_tasks(rq);
1437}
1438
1439/*
1440 * Since the task is not running and a reschedule is not going to happen
1441 * anytime soon on its runqueue, we try pushing it away now.
1442 */
1443static void task_woken_dl(struct rq *rq, struct task_struct *p)
1444{
1445 if (!task_running(rq, p) &&
1446 !test_tsk_need_resched(rq->curr) &&
1447 has_pushable_dl_tasks(rq) &&
1448 p->nr_cpus_allowed > 1 &&
1449 dl_task(rq->curr) &&
1450 (rq->curr->nr_cpus_allowed < 2 ||
1451 dl_entity_preempt(&rq->curr->dl, &p->dl))) {
1452 push_dl_tasks(rq);
1453 }
1454}
1455
1456static void set_cpus_allowed_dl(struct task_struct *p,
1457 const struct cpumask *new_mask)
1458{
1459 struct rq *rq;
1460 int weight;
1461
1462 BUG_ON(!dl_task(p));
1463
1464 /*
1465 * Update only if the task is actually running (i.e.,
1466 * it is on the rq AND it is not throttled).
1467 */
1468 if (!on_dl_rq(&p->dl))
1469 return;
1470
1471 weight = cpumask_weight(new_mask);
1472
1473 /*
1474 * Only update if the process changes whether or not it
1475 * can migrate.
1476 */
1477 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1478 return;
1479
1480 rq = task_rq(p);
1481
1482 /*
1483 * The process used to be able to migrate OR it can now migrate
1484 */
1485 if (weight <= 1) {
1486 if (!task_current(rq, p))
1487 dequeue_pushable_dl_task(rq, p);
1488 BUG_ON(!rq->dl.dl_nr_migratory);
1489 rq->dl.dl_nr_migratory--;
1490 } else {
1491 if (!task_current(rq, p))
1492 enqueue_pushable_dl_task(rq, p);
1493 rq->dl.dl_nr_migratory++;
1494 }
1495
1496 update_dl_migration(&rq->dl);
1497}
1498
1499/* Assumes rq->lock is held */
1500static void rq_online_dl(struct rq *rq)
1501{
1502 if (rq->dl.overloaded)
1503 dl_set_overload(rq);
1504
1505 if (rq->dl.dl_nr_running > 0)
1506 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1507}
1508
1509/* Assumes rq->lock is held */
1510static void rq_offline_dl(struct rq *rq)
1511{
1512 if (rq->dl.overloaded)
1513 dl_clear_overload(rq);
1514
1515 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1516}
1517
1518void init_sched_dl_class(void)
1519{
1520 unsigned int i;
1521
1522 for_each_possible_cpu(i)
1523 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
1524 GFP_KERNEL, cpu_to_node(i));
1525}
1526
1527#endif /* CONFIG_SMP */
1528
1529static void switched_from_dl(struct rq *rq, struct task_struct *p)
1530{
1531 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1532 hrtimer_try_to_cancel(&p->dl.dl_timer);
1533
1534#ifdef CONFIG_SMP
1535 /*
1536 * Since this might be the only -deadline task on the rq,
1537 * this is the right place to try to pull some other one
1538 * from an overloaded cpu, if any.
1539 */
1540 if (!rq->dl.dl_nr_running)
1541 pull_dl_task(rq);
1542#endif
1543}
1544
1545/*
1546 * When switching to -deadline, we may overload the rq, then
1547 * we try to push someone off, if possible.
1548 */
1549static void switched_to_dl(struct rq *rq, struct task_struct *p)
1550{
1551 int check_resched = 1;
1552
1553 /*
1554 * If p is throttled, don't consider the possibility
1555	 * of preempting rq->curr; the check will be done right
1556	 * after its runtime is replenished.
1557 */
1558 if (unlikely(p->dl.dl_throttled))
1559 return;
1560
1561 if (p->on_rq || rq->curr != p) {
1562#ifdef CONFIG_SMP
1563 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1564 /* Only reschedule if pushing failed */
1565 check_resched = 0;
1566#endif /* CONFIG_SMP */
1567 if (check_resched && task_has_dl_policy(rq->curr))
1568 check_preempt_curr_dl(rq, p, 0);
1569 }
1570}
1571
1572/*
1573 * If the scheduling parameters of a -deadline task changed,
1574 * a push or pull operation might be needed.
1575 */
1576static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1577 int oldprio)
1578{
1579 if (p->on_rq || rq->curr == p) {
1580#ifdef CONFIG_SMP
1581 /*
1582 * This might be too much, but unfortunately
1583 * we don't have the old deadline value, and
1584	 * we can't tell whether the task's deadline got
1585	 * earlier or later, so...
1586 */
1587 if (!rq->dl.overloaded)
1588 pull_dl_task(rq);
1589
1590 /*
1591	 * If we now have an earlier deadline task than p,
1592 * then reschedule, provided p is still on this
1593 * runqueue.
1594 */
1595 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1596 rq->curr == p)
1597 resched_task(p);
1598#else
1599 /*
1600	 * Again, we don't know if p has an earlier
1601	 * or later deadline, so let's blindly set a
1602	 * (possibly unnecessary) rescheduling point.
1603 */
1604 resched_task(p);
1605#endif /* CONFIG_SMP */
1606 } else
1607 switched_to_dl(rq, p);
1608}
1609
1610const struct sched_class dl_sched_class = {
1611 .next = &rt_sched_class,
1612 .enqueue_task = enqueue_task_dl,
1613 .dequeue_task = dequeue_task_dl,
1614 .yield_task = yield_task_dl,
1615
1616 .check_preempt_curr = check_preempt_curr_dl,
1617
1618 .pick_next_task = pick_next_task_dl,
1619 .put_prev_task = put_prev_task_dl,
1620
1621#ifdef CONFIG_SMP
1622 .select_task_rq = select_task_rq_dl,
1623 .set_cpus_allowed = set_cpus_allowed_dl,
1624 .rq_online = rq_online_dl,
1625 .rq_offline = rq_offline_dl,
1626 .pre_schedule = pre_schedule_dl,
1627 .post_schedule = post_schedule_dl,
1628 .task_woken = task_woken_dl,
1629#endif
1630
1631 .set_curr_task = set_curr_task_dl,
1632 .task_tick = task_tick_dl,
1633 .task_fork = task_fork_dl,
1634 .task_dead = task_dead_dl,
1635
1636 .prio_changed = prio_changed_dl,
1637 .switched_from = switched_from_dl,
1638 .switched_to = switched_to_dl,
1639};
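
For readers who want to see the class above driven from userspace, here is a minimal sketch, assuming the sched_setattr() syscall and struct sched_attr ABI introduced alongside this series. The struct layout and the SCHED_DEADLINE policy value below are assumptions copied from that ABI, __NR_sched_setattr must be provided by your kernel headers, and running it typically requires root; the request is also subject to the dl_bw admission test described further down in sched.h.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6	/* assumed policy value from this series */
#endif

/* Assumed layout of the new sched_setattr() ABI. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL/SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO/SCHED_RR */
	/* SCHED_DEADLINE parameters, in nanoseconds */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int sched_setattr(pid_t pid, const struct sched_attr *attr,
			 unsigned int flags)
{
	return syscall(__NR_sched_setattr, pid, attr, flags);
}

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10ms of budget ... */
		.sched_deadline	= 30 * 1000 * 1000,	/* ... due within 30ms ... */
		.sched_period	= 30 * 1000 * 1000,	/* ... every 30ms */
	};

	if (sched_setattr(0, &attr, 0)) {	/* pid 0 == calling thread */
		perror("sched_setattr");
		return 1;
	}

	/* Burn CPU; enqueue/dequeue, throttling and replenishment are
	 * now handled by the dl_sched_class hooks listed above. */
	for (;;)
		;
}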
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d1817e8f..dd52e7ffb10e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
140#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING 141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 SEQ_printf(m, " %d", task_node(p));
143#endif 143#endif
144#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
145 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
371 PN(cpu_clk); 371 PN(cpu_clk);
372 P(jiffies); 372 P(jiffies);
373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
374 P(sched_clock_stable); 374 P(sched_clock_stable());
375#endif 375#endif
376#undef PN 376#undef PN
377#undef P 377#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..9b4c4f320130 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
872 return max(smin, smax); 872 return max(smin, smax);
873} 873}
874 874
875/*
876 * Once a preferred node is selected the scheduler balancer will prefer moving
877 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
878 * scans. This will give the process the chance to accumulate more faults on
879 * the preferred node but still allow the scheduler to move the task again if
880 * the nodes CPUs are overloaded.
881 */
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 875static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{ 876{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1); 877 rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
930 if (!p->numa_group) 921 if (!p->numa_group)
931 return 0; 922 return 0;
932 923
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 924 return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 p->numa_group->faults[task_faults_idx(nid, 1)];
934} 926}
935 927
936/* 928/*
@@ -1023,7 +1015,7 @@ struct task_numa_env {
1023 1015
1024 struct numa_stats src_stats, dst_stats; 1016 struct numa_stats src_stats, dst_stats;
1025 1017
1026 int imbalance_pct, idx; 1018 int imbalance_pct;
1027 1019
1028 struct task_struct *best_task; 1020 struct task_struct *best_task;
1029 long best_imp; 1021 long best_imp;
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
1211 * elsewhere, so there is no point in (re)trying. 1203 * elsewhere, so there is no point in (re)trying.
1212 */ 1204 */
1213 if (unlikely(!sd)) { 1205 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1206 p->numa_preferred_nid = task_node(p);
1215 return -EINVAL; 1207 return -EINVAL;
1216 } 1208 }
1217 1209
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
1258 p->numa_scan_period = task_scan_min(p); 1250 p->numa_scan_period = task_scan_min(p);
1259 1251
1260 if (env.best_task == NULL) { 1252 if (env.best_task == NULL) {
1261 int ret = migrate_task_to(p, env.best_cpu); 1253 ret = migrate_task_to(p, env.best_cpu);
1254 if (ret != 0)
1255 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1262 return ret; 1256 return ret;
1263 } 1257 }
1264 1258
1265 ret = migrate_swap(p, env.best_task); 1259 ret = migrate_swap(p, env.best_task);
1260 if (ret != 0)
1261 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1266 put_task_struct(env.best_task); 1262 put_task_struct(env.best_task);
1267 return ret; 1263 return ret;
1268} 1264}
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1278 p->numa_migrate_retry = jiffies + HZ; 1274 p->numa_migrate_retry = jiffies + HZ;
1279 1275
1280 /* Success if task is already running on preferred CPU */ 1276 /* Success if task is already running on preferred CPU */
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1277 if (task_node(p) == p->numa_preferred_nid)
1282 return; 1278 return;
1283 1279
1284 /* Otherwise, try migrate to a CPU on the preferred node */ 1280 /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
1350 * scanning faster if shared accesses dominate as it may 1346 * scanning faster if shared accesses dominate as it may
1351 * simply bounce migrations uselessly 1347 * simply bounce migrations uselessly
1352 */ 1348 */
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1349 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1350 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 } 1351 }
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
1762 start = end; 1757 start = end;
1763 if (pages <= 0) 1758 if (pages <= 0)
1764 goto out; 1759 goto out;
1760
1761 cond_resched();
1765 } while (end != vma->vm_end); 1762 } while (end != vma->vm_end);
1766 } 1763 }
1767 1764
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2365 } 2362 }
2366 wakeup = 0; 2363 wakeup = 0;
2367 } else { 2364 } else {
2368 /* 2365 __synchronize_entity_decay(se);
2369 * Task re-woke on same cpu (or else migrate_task_rq_fair()
2370 * would have made count negative); we must be careful to avoid
2371 * double-accounting blocked time after synchronizing decays.
2372 */
2373 se->avg.last_runnable_update += __synchronize_entity_decay(se)
2374 << 20;
2375 } 2366 }
2376 2367
2377 /* migrated tasks did not contribute to our blocked load */ 2368 /* migrated tasks did not contribute to our blocked load */
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3923{ 3914{
3924 struct sched_entity *se = tg->se[cpu]; 3915 struct sched_entity *se = tg->se[cpu];
3925 3916
3926 if (!tg->parent || !wl) /* the trivial, non-cgroup case */ 3917 if (!tg->parent) /* the trivial, non-cgroup case */
3927 return wl; 3918 return wl;
3928 3919
3929 for_each_sched_entity(se) { 3920 for_each_sched_entity(se) {
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4101 */ 4092 */
4102static struct sched_group * 4093static struct sched_group *
4103find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4094find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4104 int this_cpu, int load_idx) 4095 int this_cpu, int sd_flag)
4105{ 4096{
4106 struct sched_group *idlest = NULL, *group = sd->groups; 4097 struct sched_group *idlest = NULL, *group = sd->groups;
4107 unsigned long min_load = ULONG_MAX, this_load = 0; 4098 unsigned long min_load = ULONG_MAX, this_load = 0;
4099 int load_idx = sd->forkexec_idx;
4108 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4100 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4109 4101
4102 if (sd_flag & SD_BALANCE_WAKE)
4103 load_idx = sd->wake_idx;
4104
4110 do { 4105 do {
4111 unsigned long load, avg_load; 4106 unsigned long load, avg_load;
4112 int local_group; 4107 int local_group;
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4274 } 4269 }
4275 4270
4276 while (sd) { 4271 while (sd) {
4277 int load_idx = sd->forkexec_idx;
4278 struct sched_group *group; 4272 struct sched_group *group;
4279 int weight; 4273 int weight;
4280 4274
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4283 continue; 4277 continue;
4284 } 4278 }
4285 4279
4286 if (sd_flag & SD_BALANCE_WAKE) 4280 group = find_idlest_group(sd, p, cpu, sd_flag);
4287 load_idx = sd->wake_idx;
4288
4289 group = find_idlest_group(sd, p, cpu, load_idx);
4290 if (!group) { 4281 if (!group) {
4291 sd = sd->child; 4282 sd = sd->child;
4292 continue; 4283 continue;
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5512 struct sched_group *group, int load_idx, 5503 struct sched_group *group, int load_idx,
5513 int local_group, struct sg_lb_stats *sgs) 5504 int local_group, struct sg_lb_stats *sgs)
5514{ 5505{
5515 unsigned long nr_running;
5516 unsigned long load; 5506 unsigned long load;
5517 int i; 5507 int i;
5518 5508
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5511 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5522 struct rq *rq = cpu_rq(i); 5512 struct rq *rq = cpu_rq(i);
5523 5513
5524 nr_running = rq->nr_running;
5525
5526 /* Bias balancing toward cpus of our domain */ 5514 /* Bias balancing toward cpus of our domain */
5527 if (local_group) 5515 if (local_group)
5528 load = target_load(i, load_idx); 5516 load = target_load(i, load_idx);
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5530 load = source_load(i, load_idx); 5518 load = source_load(i, load_idx);
5531 5519
5532 sgs->group_load += load; 5520 sgs->group_load += load;
5533 sgs->sum_nr_running += nr_running; 5521 sgs->sum_nr_running += rq->nr_running;
5534#ifdef CONFIG_NUMA_BALANCING 5522#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running; 5523 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running; 5524 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6521,7 +6509,7 @@ static struct {
6521 unsigned long next_balance; /* in jiffy units */ 6509 unsigned long next_balance; /* in jiffy units */
6522} nohz ____cacheline_aligned; 6510} nohz ____cacheline_aligned;
6523 6511
6524static inline int find_new_ilb(int call_cpu) 6512static inline int find_new_ilb(void)
6525{ 6513{
6526 int ilb = cpumask_first(nohz.idle_cpus_mask); 6514 int ilb = cpumask_first(nohz.idle_cpus_mask);
6527 6515
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
6536 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6524 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6537 * CPU (if there is one). 6525 * CPU (if there is one).
6538 */ 6526 */
6539static void nohz_balancer_kick(int cpu) 6527static void nohz_balancer_kick(void)
6540{ 6528{
6541 int ilb_cpu; 6529 int ilb_cpu;
6542 6530
6543 nohz.next_balance++; 6531 nohz.next_balance++;
6544 6532
6545 ilb_cpu = find_new_ilb(cpu); 6533 ilb_cpu = find_new_ilb();
6546 6534
6547 if (ilb_cpu >= nr_cpu_ids) 6535 if (ilb_cpu >= nr_cpu_ids)
6548 return; 6536 return;
@@ -6652,10 +6640,10 @@ void update_max_interval(void)
6652 * 6640 *
6653 * Balancing parameters are set up in init_sched_domains. 6641 * Balancing parameters are set up in init_sched_domains.
6654 */ 6642 */
6655static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6643static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6656{ 6644{
6657 int continue_balancing = 1; 6645 int continue_balancing = 1;
6658 struct rq *rq = cpu_rq(cpu); 6646 int cpu = rq->cpu;
6659 unsigned long interval; 6647 unsigned long interval;
6660 struct sched_domain *sd; 6648 struct sched_domain *sd;
6661 /* Earliest time when we have to do rebalance again */ 6649 /* Earliest time when we have to do rebalance again */
@@ -6752,9 +6740,9 @@ out:
6752 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6740 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6753 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6741 * rebalancing for all the cpus for whom scheduler ticks are stopped.
6754 */ 6742 */
6755static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6743static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6756{ 6744{
6757 struct rq *this_rq = cpu_rq(this_cpu); 6745 int this_cpu = this_rq->cpu;
6758 struct rq *rq; 6746 struct rq *rq;
6759 int balance_cpu; 6747 int balance_cpu;
6760 6748
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6781 update_idle_cpu_load(rq); 6769 update_idle_cpu_load(rq);
6782 raw_spin_unlock_irq(&rq->lock); 6770 raw_spin_unlock_irq(&rq->lock);
6783 6771
6784 rebalance_domains(balance_cpu, CPU_IDLE); 6772 rebalance_domains(rq, CPU_IDLE);
6785 6773
6786 if (time_after(this_rq->next_balance, rq->next_balance)) 6774 if (time_after(this_rq->next_balance, rq->next_balance))
6787 this_rq->next_balance = rq->next_balance; 6775 this_rq->next_balance = rq->next_balance;
@@ -6800,14 +6788,14 @@ end:
6800 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6788 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6801 * domain span are idle. 6789 * domain span are idle.
6802 */ 6790 */
6803static inline int nohz_kick_needed(struct rq *rq, int cpu) 6791static inline int nohz_kick_needed(struct rq *rq)
6804{ 6792{
6805 unsigned long now = jiffies; 6793 unsigned long now = jiffies;
6806 struct sched_domain *sd; 6794 struct sched_domain *sd;
6807 struct sched_group_power *sgp; 6795 struct sched_group_power *sgp;
6808 int nr_busy; 6796 int nr_busy, cpu = rq->cpu;
6809 6797
6810 if (unlikely(idle_cpu(cpu))) 6798 if (unlikely(rq->idle_balance))
6811 return 0; 6799 return 0;
6812 6800
6813 /* 6801 /*
@@ -6856,7 +6844,7 @@ need_kick:
6856 return 1; 6844 return 1;
6857} 6845}
6858#else 6846#else
6859static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6847static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6860#endif 6848#endif
6861 6849
6862/* 6850/*
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6865 */ 6853 */
6866static void run_rebalance_domains(struct softirq_action *h) 6854static void run_rebalance_domains(struct softirq_action *h)
6867{ 6855{
6868 int this_cpu = smp_processor_id(); 6856 struct rq *this_rq = this_rq();
6869 struct rq *this_rq = cpu_rq(this_cpu);
6870 enum cpu_idle_type idle = this_rq->idle_balance ? 6857 enum cpu_idle_type idle = this_rq->idle_balance ?
6871 CPU_IDLE : CPU_NOT_IDLE; 6858 CPU_IDLE : CPU_NOT_IDLE;
6872 6859
6873 rebalance_domains(this_cpu, idle); 6860 rebalance_domains(this_rq, idle);
6874 6861
6875 /* 6862 /*
6876 * If this cpu has a pending nohz_balance_kick, then do the 6863 * If this cpu has a pending nohz_balance_kick, then do the
6877 * balancing on behalf of the other idle cpus whose ticks are 6864 * balancing on behalf of the other idle cpus whose ticks are
6878 * stopped. 6865 * stopped.
6879 */ 6866 */
6880 nohz_idle_balance(this_cpu, idle); 6867 nohz_idle_balance(this_rq, idle);
6881} 6868}
6882 6869
6883static inline int on_null_domain(int cpu) 6870static inline int on_null_domain(struct rq *rq)
6884{ 6871{
6885 return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6872 return !rcu_dereference_sched(rq->sd);
6886} 6873}
6887 6874
6888/* 6875/*
6889 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6890 */ 6877 */
6891void trigger_load_balance(struct rq *rq, int cpu) 6878void trigger_load_balance(struct rq *rq)
6892{ 6879{
6893 /* Don't need to rebalance while attached to NULL domain */ 6880 /* Don't need to rebalance while attached to NULL domain */
6894 if (time_after_eq(jiffies, rq->next_balance) && 6881 if (unlikely(on_null_domain(rq)))
6895 likely(!on_null_domain(cpu))) 6882 return;
6883
6884 if (time_after_eq(jiffies, rq->next_balance))
6896 raise_softirq(SCHED_SOFTIRQ); 6885 raise_softirq(SCHED_SOFTIRQ);
6897#ifdef CONFIG_NO_HZ_COMMON 6886#ifdef CONFIG_NO_HZ_COMMON
6898 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6887 if (nohz_kick_needed(rq))
6899 nohz_balancer_kick(cpu); 6888 nohz_balancer_kick();
6900#endif 6889#endif
6901} 6890}
6902 6891
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7012 struct cfs_rq *cfs_rq = cfs_rq_of(se); 7001 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7013 7002
7014 /* 7003 /*
7015 * Ensure the task's vruntime is normalized, so that when its 7004 * Ensure the task's vruntime is normalized, so that when it's
7016 * switched back to the fair class the enqueue_entity(.flags=0) will 7005 * switched back to the fair class the enqueue_entity(.flags=0) will
7017 * do the right thing. 7006 * do the right thing.
7018 * 7007 *
7019 * If it was on_rq, then the dequeue_entity(.flags=0) will already 7008 * If it's on_rq, then the dequeue_entity(.flags=0) will already
7020 * have normalized the vruntime, if it was !on_rq, then only when 7009 * have normalized the vruntime, if it's !on_rq, then only when
7021 * the task is sleeping will it still have non-normalized vruntime. 7010 * the task is sleeping will it still have non-normalized vruntime.
7022 */ 7011 */
7023 if (!se->on_rq && p->state != TASK_RUNNING) { 7012 if (!p->on_rq && p->state != TASK_RUNNING) {
7024 /* 7013 /*
7025 * Fix up our vruntime so that the current sleep doesn't 7014 * Fix up our vruntime so that the current sleep doesn't
7026 * cause 'unlimited' sleep bonus. 7015 * cause 'unlimited' sleep bonus.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1c4065575fa2..1999021042c7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
538 538
539#endif /* CONFIG_RT_GROUP_SCHED */ 539#endif /* CONFIG_RT_GROUP_SCHED */
540 540
541bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
542{
543 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
544
545 return (hrtimer_active(&rt_b->rt_period_timer) ||
546 rt_rq->rt_time < rt_b->rt_runtime);
547}
548
541#ifdef CONFIG_SMP 549#ifdef CONFIG_SMP
542/* 550/*
543 * We ran out of runtime, see if we can borrow some from our neighbours. 551 * We ran out of runtime, see if we can borrow some from our neighbours.
@@ -1738,7 +1746,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1738 !test_tsk_need_resched(rq->curr) && 1746 !test_tsk_need_resched(rq->curr) &&
1739 has_pushable_tasks(rq) && 1747 has_pushable_tasks(rq) &&
1740 p->nr_cpus_allowed > 1 && 1748 p->nr_cpus_allowed > 1 &&
1741 rt_task(rq->curr) && 1749 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1742 (rq->curr->nr_cpus_allowed < 2 || 1750 (rq->curr->nr_cpus_allowed < 2 ||
1743 rq->curr->prio <= p->prio)) 1751 rq->curr->prio <= p->prio))
1744 push_rt_tasks(rq); 1752 push_rt_tasks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b21d633..f964add50f38 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
5#include <linux/mutex.h> 6#include <linux/mutex.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
@@ -9,6 +10,7 @@
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#include "cpupri.h" 12#include "cpupri.h"
13#include "cpudeadline.h"
12#include "cpuacct.h" 14#include "cpuacct.h"
13 15
14struct rq; 16struct rq;
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
73#define NICE_0_SHIFT SCHED_LOAD_SHIFT 75#define NICE_0_SHIFT SCHED_LOAD_SHIFT
74 76
75/* 77/*
78 * Single value that decides SCHED_DEADLINE internal math precision.
79 * 10 -> just above 1us
80 * 9 -> just above 0.5us
81 */
82#define DL_SCALE (10)
83
84/*
76 * These are the 'tuning knobs' of the scheduler: 85 * These are the 'tuning knobs' of the scheduler:
77 */ 86 */
78 87
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
81 */ 90 */
82#define RUNTIME_INF ((u64)~0ULL) 91#define RUNTIME_INF ((u64)~0ULL)
83 92
93static inline int fair_policy(int policy)
94{
95 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
96}
97
84static inline int rt_policy(int policy) 98static inline int rt_policy(int policy)
85{ 99{
86 if (policy == SCHED_FIFO || policy == SCHED_RR) 100 return policy == SCHED_FIFO || policy == SCHED_RR;
87 return 1; 101}
88 return 0; 102
103static inline int dl_policy(int policy)
104{
105 return policy == SCHED_DEADLINE;
89} 106}
90 107
91static inline int task_has_rt_policy(struct task_struct *p) 108static inline int task_has_rt_policy(struct task_struct *p)
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
93 return rt_policy(p->policy); 110 return rt_policy(p->policy);
94} 111}
95 112
113static inline int task_has_dl_policy(struct task_struct *p)
114{
115 return dl_policy(p->policy);
116}
117
118static inline bool dl_time_before(u64 a, u64 b)
119{
120 return (s64)(a - b) < 0;
121}
122
123/*
124 * Tells if entity @a should preempt entity @b.
125 */
126static inline bool
127dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
128{
129 return dl_time_before(a->deadline, b->deadline);
130}
131
96/* 132/*
97 * This is the priority-queue data structure of the RT scheduling class: 133 * This is the priority-queue data structure of the RT scheduling class:
98 */ 134 */
@@ -108,6 +144,47 @@ struct rt_bandwidth {
108 u64 rt_runtime; 144 u64 rt_runtime;
109 struct hrtimer rt_period_timer; 145 struct hrtimer rt_period_timer;
110}; 146};
147/*
148 * To keep the bandwidth of -deadline tasks and groups under control
149 * we need some place where we can:
150 * - store the maximum -deadline bandwidth of the system (the group);
151 * - cache the fraction of that bandwidth that is currently allocated.
152 *
153 * This is all done in the data structure below. It is similar to the
154 * one used for RT-throttling (rt_bandwidth), with the main difference
155 * that, since here we are only interested in admission control, we
156 * do not decrease any runtime while the group "executes", nor do we
157 * need a timer to replenish it.
158 *
159 * With respect to SMP, the bandwidth is given on a per-CPU basis,
160 * meaning that:
161 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
162 * - dl_total_bw array contains, in the i-th element, the currently
163 *   allocated bandwidth on the i-th CPU.
164 * Moreover, groups consume bandwidth on each CPU, while tasks only
165 *   consume bandwidth on the CPU they're running on.
166 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
167 *   that will be shown the next time the proc or cgroup controls are
168 *   read. It can in turn be changed by writing to its own
169 *   control.
170 */
171struct dl_bandwidth {
172 raw_spinlock_t dl_runtime_lock;
173 u64 dl_runtime;
174 u64 dl_period;
175};
176
177static inline int dl_bandwidth_enabled(void)
178{
179 return sysctl_sched_rt_runtime >= 0;
180}
181
182extern struct dl_bw *dl_bw_of(int i);
183
184struct dl_bw {
185 raw_spinlock_t lock;
186 u64 bw, total_bw;
187};
111 188
112extern struct mutex sched_domains_mutex; 189extern struct mutex sched_domains_mutex;
113 190
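
The comment above describes what dl_bw stores; the arithmetic behind it is plain fixed-point math. The sketch below assumes to_ratio() keeps its usual runtime<<20/period definition (the helper is only declared later in this header), assumes the cap defaults to the RT throttling values (95% of each CPU, cf. dl_bandwidth_enabled() above), and uses a made-up dl_overflow() helper modeled on, but simpler than, the admission test in core.c:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20	/* 1.0 == 1 << 20, the scale to_ratio() is assumed to use */

/* Fraction of one CPU consumed by 'runtime' every 'period' (fixed point). */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

struct dl_bw {
	uint64_t bw;		/* cap: bandwidth usable by -deadline per CPU */
	uint64_t total_bw;	/* bandwidth already admitted */
};

/* Would admitting new_bw on a 'cpus'-wide root domain exceed the cap? */
static int dl_overflow(const struct dl_bw *b, int cpus, uint64_t new_bw)
{
	return b->bw * cpus < b->total_bw + new_bw;
}

int main(void)
{
	/* Assumed default cap: 950000us of runtime every 1000000us. */
	struct dl_bw b = { .bw = to_ratio(1000000, 950000), .total_bw = 0 };
	/* A task asking for 10ms every 30ms uses about 1/3 of a CPU. */
	uint64_t task_bw = to_ratio(30000000, 10000000);
	int i;

	for (i = 1; !dl_overflow(&b, 1, task_bw); i++) {
		b.total_bw += task_bw;
		printf("admitted task %d (total %llu of %llu)\n", i,
		       (unsigned long long)b.total_bw,
		       (unsigned long long)b.bw);
	}
	printf("task %d rejected: bandwidth cap reached\n", i);
	return 0;
}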
@@ -364,6 +441,41 @@ struct rt_rq {
364#endif 441#endif
365}; 442};
366 443
444/* Deadline class' related fields in a runqueue */
445struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */
447 struct rb_root rb_root;
448 struct rb_node *rb_leftmost;
449
450 unsigned long dl_nr_running;
451
452#ifdef CONFIG_SMP
453 /*
454 * Deadline values of the currently executing and the
455 * earliest ready task on this rq. Caching these facilitates
456	 * the decision whether or not a ready but not running task
457 * should migrate somewhere else.
458 */
459 struct {
460 u64 curr;
461 u64 next;
462 } earliest_dl;
463
464 unsigned long dl_nr_migratory;
465 int overloaded;
466
467 /*
468 * Tasks on this rq that can be pushed away. They are kept in
469 * an rb-tree, ordered by tasks' deadlines, with caching
470 * of the leftmost (earliest deadline) element.
471 */
472 struct rb_root pushable_dl_tasks_root;
473 struct rb_node *pushable_dl_tasks_leftmost;
474#else
475 struct dl_bw dl_bw;
476#endif
477};
478
367#ifdef CONFIG_SMP 479#ifdef CONFIG_SMP
368 480
369/* 481/*
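
The dl_rq above is, at its core, an EDF ready-queue: entities ordered by absolute deadline, with the leftmost (earliest) node cached so the next task to run is found in O(1). The toy below mirrors that ordering with a sorted array instead of an rb-tree; the task names and deadline values are made up for illustration:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_dl_task {
	const char *name;
	uint64_t deadline;	/* absolute deadline, like dl_se->deadline */
};

/* Earliest absolute deadline first; same wrap-safe ordering as dl_time_before(). */
static int by_deadline(const void *pa, const void *pb)
{
	const struct toy_dl_task *a = pa, *b = pb;

	if (a->deadline == b->deadline)
		return 0;
	return (int64_t)(a->deadline - b->deadline) < 0 ? -1 : 1;
}

int main(void)
{
	struct toy_dl_task rq[] = {
		{ "video",  30000000 },
		{ "audio",  10000000 },
		{ "logger", 90000000 },
	};
	size_t n = sizeof(rq) / sizeof(rq[0]);

	qsort(rq, n, sizeof(rq[0]), by_deadline);

	/* rq[0] plays the role of the cached leftmost node. */
	printf("pick next: %s (deadline %llu)\n",
	       rq[0].name, (unsigned long long)rq[0].deadline);
	return 0;
}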
@@ -382,6 +494,15 @@ struct root_domain {
382 cpumask_var_t online; 494 cpumask_var_t online;
383 495
384 /* 496 /*
497	 * The bit corresponding to a CPU gets set here if that CPU has more
498	 * than one runnable -deadline task (as is done below for RT tasks).
499 */
500 cpumask_var_t dlo_mask;
501 atomic_t dlo_count;
502 struct dl_bw dl_bw;
503 struct cpudl cpudl;
504
505 /*
385 * The "RT overload" flag: it gets set if a CPU has more than 506 * The "RT overload" flag: it gets set if a CPU has more than
386 * one runnable RT task. 507 * one runnable RT task.
387 */ 508 */
@@ -432,6 +553,7 @@ struct rq {
432 553
433 struct cfs_rq cfs; 554 struct cfs_rq cfs;
434 struct rt_rq rt; 555 struct rt_rq rt;
556 struct dl_rq dl;
435 557
436#ifdef CONFIG_FAIR_GROUP_SCHED 558#ifdef CONFIG_FAIR_GROUP_SCHED
437 /* list of leaf cfs_rq on this cpu: */ 559 /* list of leaf cfs_rq on this cpu: */
@@ -827,8 +949,6 @@ static inline u64 global_rt_runtime(void)
827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 949 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
828} 950}
829 951
830
831
832static inline int task_current(struct rq *rq, struct task_struct *p) 952static inline int task_current(struct rq *rq, struct task_struct *p)
833{ 953{
834 return rq->curr == p; 954 return rq->curr == p;
@@ -988,6 +1108,7 @@ static const u32 prio_to_wmult[40] = {
988#else 1108#else
989#define ENQUEUE_WAKING 0 1109#define ENQUEUE_WAKING 0
990#endif 1110#endif
1111#define ENQUEUE_REPLENISH 8
991 1112
992#define DEQUEUE_SLEEP 1 1113#define DEQUEUE_SLEEP 1
993 1114
@@ -1023,6 +1144,7 @@ struct sched_class {
1023 void (*set_curr_task) (struct rq *rq); 1144 void (*set_curr_task) (struct rq *rq);
1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1145 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1025 void (*task_fork) (struct task_struct *p); 1146 void (*task_fork) (struct task_struct *p);
1147 void (*task_dead) (struct task_struct *p);
1026 1148
1027 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1149 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1028 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1150 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@@ -1042,6 +1164,7 @@ struct sched_class {
1042 for (class = sched_class_highest; class; class = class->next) 1164 for (class = sched_class_highest; class; class = class->next)
1043 1165
1044extern const struct sched_class stop_sched_class; 1166extern const struct sched_class stop_sched_class;
1167extern const struct sched_class dl_sched_class;
1045extern const struct sched_class rt_sched_class; 1168extern const struct sched_class rt_sched_class;
1046extern const struct sched_class fair_sched_class; 1169extern const struct sched_class fair_sched_class;
1047extern const struct sched_class idle_sched_class; 1170extern const struct sched_class idle_sched_class;
@@ -1051,7 +1174,7 @@ extern const struct sched_class idle_sched_class;
1051 1174
1052extern void update_group_power(struct sched_domain *sd, int cpu); 1175extern void update_group_power(struct sched_domain *sd, int cpu);
1053 1176
1054extern void trigger_load_balance(struct rq *rq, int cpu); 1177extern void trigger_load_balance(struct rq *rq);
1055extern void idle_balance(int this_cpu, struct rq *this_rq); 1178extern void idle_balance(int this_cpu, struct rq *this_rq);
1056 1179
1057extern void idle_enter_fair(struct rq *this_rq); 1180extern void idle_enter_fair(struct rq *this_rq);
@@ -1068,8 +1191,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
1068extern void sysrq_sched_debug_show(void); 1191extern void sysrq_sched_debug_show(void);
1069extern void sched_init_granularity(void); 1192extern void sched_init_granularity(void);
1070extern void update_max_interval(void); 1193extern void update_max_interval(void);
1194
1195extern void init_sched_dl_class(void);
1071extern void init_sched_rt_class(void); 1196extern void init_sched_rt_class(void);
1072extern void init_sched_fair_class(void); 1197extern void init_sched_fair_class(void);
1198extern void init_sched_dl_class(void);
1073 1199
1074extern void resched_task(struct task_struct *p); 1200extern void resched_task(struct task_struct *p);
1075extern void resched_cpu(int cpu); 1201extern void resched_cpu(int cpu);
@@ -1077,6 +1203,12 @@ extern void resched_cpu(int cpu);
1077extern struct rt_bandwidth def_rt_bandwidth; 1203extern struct rt_bandwidth def_rt_bandwidth;
1078extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1204extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1079 1205
1206extern struct dl_bandwidth def_dl_bandwidth;
1207extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1208extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1209
1210unsigned long to_ratio(u64 period, u64 runtime);
1211
1080extern void update_idle_cpu_load(struct rq *this_rq); 1212extern void update_idle_cpu_load(struct rq *this_rq);
1081 1213
1082extern void init_task_runnable_average(struct task_struct *p); 1214extern void init_task_runnable_average(struct task_struct *p);
@@ -1353,6 +1485,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1353 1485
1354extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1486extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1487extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1488extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1356 1489
1357extern void cfs_bandwidth_usage_inc(void); 1490extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void); 1491extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
103 * Simple, special scheduling class for the per-CPU stop tasks: 103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */ 104 */
105const struct sched_class stop_sched_class = { 105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class, 106 .next = &dl_sched_class,
107 107
108 .enqueue_task = enqueue_task_stop, 108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop, 109 .dequeue_task = dequeue_task_stop,
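
With stop_sched_class now chaining to dl_sched_class, the class list becomes stop -> deadline -> rt -> fair -> idle, i.e. -deadline tasks rank above everything except the per-CPU stop tasks, since the core scheduler consults the classes in list order. A toy userspace model of that chain (not kernel code; the ordering is the only thing it demonstrates):

#include <stdio.h>
#include <stddef.h>

/* Mirrors the .next links wired up by this series. */
struct toy_class {
	const char *name;
	const struct toy_class *next;
};

static const struct toy_class idle_class = { "idle",     NULL };
static const struct toy_class fair_class = { "fair",     &idle_class };
static const struct toy_class rt_class   = { "rt",       &fair_class };
static const struct toy_class dl_class   = { "deadline", &rt_class };
static const struct toy_class stop_class = { "stop",     &dl_class };

int main(void)
{
	const struct toy_class *c;

	/* The scheduler picks the first class in this order that has a runnable task. */
	for (c = &stop_class; c; c = c->next)
		printf("%s%s", c->name, c->next ? " -> " : "\n");
	return 0;
}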