Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/Makefile      |    5
-rw-r--r--   kernel/sched/clock.c       |  107
-rw-r--r--   kernel/sched/core.c        |  880
-rw-r--r--   kernel/sched/cpuacct.c     |   18
-rw-r--r--   kernel/sched/cpudeadline.c |  216
-rw-r--r--   kernel/sched/cpudeadline.h |   33
-rw-r--r--   kernel/sched/deadline.c    | 1639
-rw-r--r--   kernel/sched/debug.c       |    4
-rw-r--r--   kernel/sched/fair.c        |  109
-rw-r--r--   kernel/sched/rt.c          |   10
-rw-r--r--   kernel/sched/sched.h       |  145
-rw-r--r--   kernel/sched/stop_task.c   |    2
12 files changed, 2931 insertions, 237 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o |
15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | ||
15 | obj-y += wait.o completion.o | 16 | obj-y += wait.o completion.o |
16 | obj-$(CONFIG_SMP) += cpupri.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
18 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
19 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 20 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@ | |||
26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
27 | * | 27 | * |
28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
30 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current cpu. |
31 | * | 30 | * |
31 | * sched_clock_cpu(i) | ||
32 | * | ||
32 | * How: | 33 | * How: |
33 | * | 34 | * |
34 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
@@ -50,15 +51,6 @@ | |||
50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 51 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
51 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
52 | * | 53 | * |
53 | * | ||
54 | * Notes: | ||
55 | * | ||
56 | * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things | ||
57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
58 | * and cause funny jumps in time -- although the filtering provided by | ||
59 | * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it | ||
60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
61 | * sched_clock(). | ||
62 | */ | 54 | */ |
63 | #include <linux/spinlock.h> | 55 | #include <linux/spinlock.h> |
64 | #include <linux/hardirq.h> | 56 | #include <linux/hardirq.h> |
@@ -66,6 +58,8 @@ | |||
66 | #include <linux/percpu.h> | 58 | #include <linux/percpu.h> |
67 | #include <linux/ktime.h> | 59 | #include <linux/ktime.h> |
68 | #include <linux/sched.h> | 60 | #include <linux/sched.h> |
61 | #include <linux/static_key.h> | ||
62 | #include <linux/workqueue.h> | ||
69 | 63 | ||
70 | /* | 64 | /* |
71 | * Scheduler clock - returns current time in nanosec units. | 65 | * Scheduler clock - returns current time in nanosec units. |
@@ -82,7 +76,52 @@ EXPORT_SYMBOL_GPL(sched_clock); | |||
82 | __read_mostly int sched_clock_running; | 76 | __read_mostly int sched_clock_running; |
83 | 77 | ||
84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 78 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
85 | __read_mostly int sched_clock_stable; | 79 | static struct static_key __sched_clock_stable = STATIC_KEY_INIT; |
80 | static int __sched_clock_stable_early; | ||
81 | |||
82 | int sched_clock_stable(void) | ||
83 | { | ||
84 | return static_key_false(&__sched_clock_stable); | ||
85 | } | ||
86 | |||
87 | static void __set_sched_clock_stable(void) | ||
88 | { | ||
89 | if (!sched_clock_stable()) | ||
90 | static_key_slow_inc(&__sched_clock_stable); | ||
91 | } | ||
92 | |||
93 | void set_sched_clock_stable(void) | ||
94 | { | ||
95 | __sched_clock_stable_early = 1; | ||
96 | |||
97 | smp_mb(); /* matches sched_clock_init() */ | ||
98 | |||
99 | if (!sched_clock_running) | ||
100 | return; | ||
101 | |||
102 | __set_sched_clock_stable(); | ||
103 | } | ||
104 | |||
105 | static void __clear_sched_clock_stable(struct work_struct *work) | ||
106 | { | ||
107 | /* XXX worry about clock continuity */ | ||
108 | if (sched_clock_stable()) | ||
109 | static_key_slow_dec(&__sched_clock_stable); | ||
110 | } | ||
111 | |||
112 | static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); | ||
113 | |||
114 | void clear_sched_clock_stable(void) | ||
115 | { | ||
116 | __sched_clock_stable_early = 0; | ||
117 | |||
118 | smp_mb(); /* matches sched_clock_init() */ | ||
119 | |||
120 | if (!sched_clock_running) | ||
121 | return; | ||
122 | |||
123 | schedule_work(&sched_clock_work); | ||
124 | } | ||
86 | 125 | ||
87 | struct sched_clock_data { | 126 | struct sched_clock_data { |
88 | u64 tick_raw; | 127 | u64 tick_raw; |
@@ -116,6 +155,20 @@ void sched_clock_init(void) | |||
116 | } | 155 | } |
117 | 156 | ||
118 | sched_clock_running = 1; | 157 | sched_clock_running = 1; |
158 | |||
159 | /* | ||
160 | * Ensure that it is impossible to not do a static_key update. | ||
161 | * | ||
162 | * Either {set,clear}_sched_clock_stable() must see sched_clock_running | ||
163 | * and do the update, or we must see their __sched_clock_stable_early | ||
164 | * and do the update, or both. | ||
165 | */ | ||
166 | smp_mb(); /* matches {set,clear}_sched_clock_stable() */ | ||
167 | |||
168 | if (__sched_clock_stable_early) | ||
169 | __set_sched_clock_stable(); | ||
170 | else | ||
171 | __clear_sched_clock_stable(NULL); | ||
119 | } | 172 | } |
120 | 173 | ||
121 | /* | 174 | /* |
@@ -242,20 +295,20 @@ u64 sched_clock_cpu(int cpu) | |||
242 | struct sched_clock_data *scd; | 295 | struct sched_clock_data *scd; |
243 | u64 clock; | 296 | u64 clock; |
244 | 297 | ||
245 | WARN_ON_ONCE(!irqs_disabled()); | 298 | if (sched_clock_stable()) |
246 | |||
247 | if (sched_clock_stable) | ||
248 | return sched_clock(); | 299 | return sched_clock(); |
249 | 300 | ||
250 | if (unlikely(!sched_clock_running)) | 301 | if (unlikely(!sched_clock_running)) |
251 | return 0ull; | 302 | return 0ull; |
252 | 303 | ||
304 | preempt_disable_notrace(); | ||
253 | scd = cpu_sdc(cpu); | 305 | scd = cpu_sdc(cpu); |
254 | 306 | ||
255 | if (cpu != smp_processor_id()) | 307 | if (cpu != smp_processor_id()) |
256 | clock = sched_clock_remote(scd); | 308 | clock = sched_clock_remote(scd); |
257 | else | 309 | else |
258 | clock = sched_clock_local(scd); | 310 | clock = sched_clock_local(scd); |
311 | preempt_enable_notrace(); | ||
259 | 312 | ||
260 | return clock; | 313 | return clock; |
261 | } | 314 | } |
@@ -265,7 +318,7 @@ void sched_clock_tick(void) | |||
265 | struct sched_clock_data *scd; | 318 | struct sched_clock_data *scd; |
266 | u64 now, now_gtod; | 319 | u64 now, now_gtod; |
267 | 320 | ||
268 | if (sched_clock_stable) | 321 | if (sched_clock_stable()) |
269 | return; | 322 | return; |
270 | 323 | ||
271 | if (unlikely(!sched_clock_running)) | 324 | if (unlikely(!sched_clock_running)) |
@@ -316,14 +369,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
316 | */ | 369 | */ |
317 | u64 cpu_clock(int cpu) | 370 | u64 cpu_clock(int cpu) |
318 | { | 371 | { |
319 | u64 clock; | 372 | if (!sched_clock_stable()) |
320 | unsigned long flags; | 373 | return sched_clock_cpu(cpu); |
321 | |||
322 | local_irq_save(flags); | ||
323 | clock = sched_clock_cpu(cpu); | ||
324 | local_irq_restore(flags); | ||
325 | 374 | ||
326 | return clock; | 375 | return sched_clock(); |
327 | } | 376 | } |
328 | 377 | ||
329 | /* | 378 | /* |
@@ -335,14 +384,10 @@ u64 cpu_clock(int cpu) | |||
335 | */ | 384 | */ |
336 | u64 local_clock(void) | 385 | u64 local_clock(void) |
337 | { | 386 | { |
338 | u64 clock; | 387 | if (!sched_clock_stable()) |
339 | unsigned long flags; | 388 | return sched_clock_cpu(raw_smp_processor_id()); |
340 | 389 | ||
341 | local_irq_save(flags); | 390 | return sched_clock(); |
342 | clock = sched_clock_cpu(smp_processor_id()); | ||
343 | local_irq_restore(flags); | ||
344 | |||
345 | return clock; | ||
346 | } | 391 | } |
347 | 392 | ||
348 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 393 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
@@ -362,12 +407,12 @@ u64 sched_clock_cpu(int cpu) | |||
362 | 407 | ||
363 | u64 cpu_clock(int cpu) | 408 | u64 cpu_clock(int cpu) |
364 | { | 409 | { |
365 | return sched_clock_cpu(cpu); | 410 | return sched_clock(); |
366 | } | 411 | } |
367 | 412 | ||
368 | u64 local_clock(void) | 413 | u64 local_clock(void) |
369 | { | 414 | { |
370 | return sched_clock_cpu(0); | 415 | return sched_clock(); |
371 | } | 416 | } |
372 | 417 | ||
373 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 418 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
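The clock.c hunks above turn sched_clock_stable into a static-key backed sched_clock_stable() and drop the WARN_ON_ONCE(!irqs_disabled()) from sched_clock_cpu(), so cpu_clock() and local_clock() no longer save and restore IRQ flags on the stable path. Below is a minimal kernel-module style sketch of a caller that times a section with local_clock(); the module name and the 1 ms delay are made up for illustration, and it assumes the usual kbuild Makefile and a tree of this era, where local_clock() is declared in <linux/sched.h>.

/* timing_demo.c: illustrative caller of local_clock(), not part of the patch */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/sched.h>	/* sched_clock() / cpu_clock() / local_clock() */

static int __init timing_demo_init(void)
{
	u64 t0, t1;

	t0 = local_clock();	/* usable from any context, including NMI */
	mdelay(1);		/* stand-in for the work being measured */
	t1 = local_clock();

	pr_info("timing_demo: section took %llu ns\n",
		(unsigned long long)(t1 - t0));
	return 0;
}

static void __exit timing_demo_exit(void)
{
}

module_init(timing_demo_init);
module_exit(timing_demo_exit);
MODULE_LICENSE("GPL");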
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..f5c6635b806c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running; | |||
296 | */ | 296 | */ |
297 | int sysctl_sched_rt_runtime = 950000; | 297 | int sysctl_sched_rt_runtime = 950000; |
298 | 298 | ||
299 | |||
300 | |||
301 | /* | 299 | /* |
302 | * __task_rq_lock - lock the rq @p resides on. | 300 | * __task_rq_lock - lock the rq @p resides on. |
303 | */ | 301 | */ |
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p) | |||
899 | { | 897 | { |
900 | int prio; | 898 | int prio; |
901 | 899 | ||
902 | if (task_has_rt_policy(p)) | 900 | if (task_has_dl_policy(p)) |
901 | prio = MAX_DL_PRIO-1; | ||
902 | else if (task_has_rt_policy(p)) | ||
903 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 903 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
904 | else | 904 | else |
905 | prio = __normal_prio(p); | 905 | prio = __normal_prio(p); |
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
945 | if (prev_class->switched_from) | 945 | if (prev_class->switched_from) |
946 | prev_class->switched_from(rq, p); | 946 | prev_class->switched_from(rq, p); |
947 | p->sched_class->switched_to(rq, p); | 947 | p->sched_class->switched_to(rq, p); |
948 | } else if (oldprio != p->prio) | 948 | } else if (oldprio != p->prio || dl_task(p)) |
949 | p->sched_class->prio_changed(rq, p, oldprio); | 949 | p->sched_class->prio_changed(rq, p, oldprio); |
950 | } | 950 | } |
951 | 951 | ||
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | 1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) |
1109 | goto out; | 1109 | goto out; |
1110 | 1110 | ||
1111 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); | ||
1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | 1112 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); |
1112 | 1113 | ||
1113 | out: | 1114 | out: |
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void) | |||
1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | 1500 | * TIF_NEED_RESCHED remotely (for the first time) will also send |
1500 | * this IPI. | 1501 | * this IPI. |
1501 | */ | 1502 | */ |
1502 | if (tif_need_resched()) | 1503 | preempt_fold_need_resched(); |
1503 | set_preempt_need_resched(); | ||
1504 | 1504 | ||
1505 | if (llist_empty(&this_rq()->wake_list) | 1505 | if (llist_empty(&this_rq()->wake_list) |
1506 | && !tick_nohz_full_cpu(smp_processor_id()) | 1506 | && !tick_nohz_full_cpu(smp_processor_id()) |
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1718 | #endif | 1718 | #endif |
1719 | 1719 | ||
1720 | RB_CLEAR_NODE(&p->dl.rb_node); | ||
1721 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1722 | p->dl.dl_runtime = p->dl.runtime = 0; | ||
1723 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
1724 | p->dl.dl_period = 0; | ||
1725 | p->dl.flags = 0; | ||
1726 | |||
1720 | INIT_LIST_HEAD(&p->rt.run_list); | 1727 | INIT_LIST_HEAD(&p->rt.run_list); |
1721 | 1728 | ||
1722 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1729 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled) | |||
1763 | numabalancing_enabled = enabled; | 1770 | numabalancing_enabled = enabled; |
1764 | } | 1771 | } |
1765 | #endif /* CONFIG_SCHED_DEBUG */ | 1772 | #endif /* CONFIG_SCHED_DEBUG */ |
1766 | #endif /* CONFIG_NUMA_BALANCING */ | 1773 | |
1774 | #ifdef CONFIG_PROC_SYSCTL | ||
1775 | int sysctl_numa_balancing(struct ctl_table *table, int write, | ||
1776 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1777 | { | ||
1778 | struct ctl_table t; | ||
1779 | int err; | ||
1780 | int state = numabalancing_enabled; | ||
1781 | |||
1782 | if (write && !capable(CAP_SYS_ADMIN)) | ||
1783 | return -EPERM; | ||
1784 | |||
1785 | t = *table; | ||
1786 | t.data = &state; | ||
1787 | err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); | ||
1788 | if (err < 0) | ||
1789 | return err; | ||
1790 | if (write) | ||
1791 | set_numabalancing_state(state); | ||
1792 | return err; | ||
1793 | } | ||
1794 | #endif | ||
1795 | #endif | ||
1767 | 1796 | ||
1768 | /* | 1797 | /* |
1769 | * fork()/clone()-time setup: | 1798 | * fork()/clone()-time setup: |
1770 | */ | 1799 | */ |
1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) | 1800 | int sched_fork(unsigned long clone_flags, struct task_struct *p) |
1772 | { | 1801 | { |
1773 | unsigned long flags; | 1802 | unsigned long flags; |
1774 | int cpu = get_cpu(); | 1803 | int cpu = get_cpu(); |
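The sysctl_numa_balancing() handler added above wires the existing set_numabalancing_state() toggle to procfs, so the feature can be flipped at runtime. A small userspace sketch of flipping that knob; the path /proc/sys/kernel/numa_balancing is the standard sysctl location, and the write needs CAP_SYS_ADMIN, matching the capable() check in the handler.

/* Illustrative only: toggle automatic NUMA balancing through the new sysctl. */
#include <stdio.h>

static int set_numa_balancing(int enable)
{
	FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

	if (!f)
		return -1;	/* not privileged, or sysctl not present */
	fprintf(f, "%d\n", enable ? 1 : 0);
	return fclose(f);
}

int main(void)
{
	return set_numa_balancing(1) ? 1 : 0;
}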
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1790 | * Revert to default priority/policy on fork if requested. | 1819 | * Revert to default priority/policy on fork if requested. |
1791 | */ | 1820 | */ |
1792 | if (unlikely(p->sched_reset_on_fork)) { | 1821 | if (unlikely(p->sched_reset_on_fork)) { |
1793 | if (task_has_rt_policy(p)) { | 1822 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
1794 | p->policy = SCHED_NORMAL; | 1823 | p->policy = SCHED_NORMAL; |
1795 | p->static_prio = NICE_TO_PRIO(0); | 1824 | p->static_prio = NICE_TO_PRIO(0); |
1796 | p->rt_priority = 0; | 1825 | p->rt_priority = 0; |
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1807 | p->sched_reset_on_fork = 0; | 1836 | p->sched_reset_on_fork = 0; |
1808 | } | 1837 | } |
1809 | 1838 | ||
1810 | if (!rt_prio(p->prio)) | 1839 | if (dl_prio(p->prio)) { |
1840 | put_cpu(); | ||
1841 | return -EAGAIN; | ||
1842 | } else if (rt_prio(p->prio)) { | ||
1843 | p->sched_class = &rt_sched_class; | ||
1844 | } else { | ||
1811 | p->sched_class = &fair_sched_class; | 1845 | p->sched_class = &fair_sched_class; |
1846 | } | ||
1812 | 1847 | ||
1813 | if (p->sched_class->task_fork) | 1848 | if (p->sched_class->task_fork) |
1814 | p->sched_class->task_fork(p); | 1849 | p->sched_class->task_fork(p); |
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1834 | init_task_preempt_count(p); | 1869 | init_task_preempt_count(p); |
1835 | #ifdef CONFIG_SMP | 1870 | #ifdef CONFIG_SMP |
1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1871 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
1872 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
1837 | #endif | 1873 | #endif |
1838 | 1874 | ||
1839 | put_cpu(); | 1875 | put_cpu(); |
1876 | return 0; | ||
1877 | } | ||
1878 | |||
1879 | unsigned long to_ratio(u64 period, u64 runtime) | ||
1880 | { | ||
1881 | if (runtime == RUNTIME_INF) | ||
1882 | return 1ULL << 20; | ||
1883 | |||
1884 | /* | ||
1885 | * Doing this here saves a lot of checks in all | ||
1886 | * the calling paths, and returning zero seems | ||
1887 | * safe for them anyway. | ||
1888 | */ | ||
1889 | if (period == 0) | ||
1890 | return 0; | ||
1891 | |||
1892 | return div64_u64(runtime << 20, period); | ||
1840 | } | 1893 | } |
1841 | 1894 | ||
1895 | #ifdef CONFIG_SMP | ||
1896 | inline struct dl_bw *dl_bw_of(int i) | ||
1897 | { | ||
1898 | return &cpu_rq(i)->rd->dl_bw; | ||
1899 | } | ||
1900 | |||
1901 | static inline int dl_bw_cpus(int i) | ||
1902 | { | ||
1903 | struct root_domain *rd = cpu_rq(i)->rd; | ||
1904 | int cpus = 0; | ||
1905 | |||
1906 | for_each_cpu_and(i, rd->span, cpu_active_mask) | ||
1907 | cpus++; | ||
1908 | |||
1909 | return cpus; | ||
1910 | } | ||
1911 | #else | ||
1912 | inline struct dl_bw *dl_bw_of(int i) | ||
1913 | { | ||
1914 | return &cpu_rq(i)->dl.dl_bw; | ||
1915 | } | ||
1916 | |||
1917 | static inline int dl_bw_cpus(int i) | ||
1918 | { | ||
1919 | return 1; | ||
1920 | } | ||
1921 | #endif | ||
1922 | |||
1923 | static inline | ||
1924 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
1925 | { | ||
1926 | dl_b->total_bw -= tsk_bw; | ||
1927 | } | ||
1928 | |||
1929 | static inline | ||
1930 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
1931 | { | ||
1932 | dl_b->total_bw += tsk_bw; | ||
1933 | } | ||
1934 | |||
1935 | static inline | ||
1936 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
1937 | { | ||
1938 | return dl_b->bw != -1 && | ||
1939 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
1940 | } | ||
1941 | |||
1942 | /* | ||
1943 | * We must be sure that accepting a new task (or allowing changing the | ||
1944 | * parameters of an existing one) is consistent with the bandwidth | ||
1945 | * constraints. If yes, this function also accordingly updates the currently | ||
1946 | * allocated bandwidth to reflect the new situation. | ||
1947 | * | ||
1948 | * This function is called while holding p's rq->lock. | ||
1949 | */ | ||
1950 | static int dl_overflow(struct task_struct *p, int policy, | ||
1951 | const struct sched_attr *attr) | ||
1952 | { | ||
1953 | |||
1954 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1955 | u64 period = attr->sched_period ?: attr->sched_deadline; | ||
1956 | u64 runtime = attr->sched_runtime; | ||
1957 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
1958 | int cpus, err = -1; | ||
1959 | |||
1960 | if (new_bw == p->dl.dl_bw) | ||
1961 | return 0; | ||
1962 | |||
1963 | /* | ||
1964 | * Either if a task, enters, leave, or stays -deadline but changes | ||
1965 | * its parameters, we may need to update accordingly the total | ||
1966 | * allocated bandwidth of the container. | ||
1967 | */ | ||
1968 | raw_spin_lock(&dl_b->lock); | ||
1969 | cpus = dl_bw_cpus(task_cpu(p)); | ||
1970 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
1971 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
1972 | __dl_add(dl_b, new_bw); | ||
1973 | err = 0; | ||
1974 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
1975 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
1976 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1977 | __dl_add(dl_b, new_bw); | ||
1978 | err = 0; | ||
1979 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
1980 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1981 | err = 0; | ||
1982 | } | ||
1983 | raw_spin_unlock(&dl_b->lock); | ||
1984 | |||
1985 | return err; | ||
1986 | } | ||
1987 | |||
1988 | extern void init_dl_bw(struct dl_bw *dl_b); | ||
1989 | |||
1842 | /* | 1990 | /* |
1843 | * wake_up_new_task - wake up a newly created task for the first time. | 1991 | * wake_up_new_task - wake up a newly created task for the first time. |
1844 | * | 1992 | * |
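to_ratio() and dl_overflow() above implement the admission test as fixed-point arithmetic: a task's bandwidth is runtime/period scaled by 2^20, and a new -deadline task is admitted only if the root domain's total stays below the global limit times the number of CPUs (ignoring the bw == -1 "no limit" special case). A worked userspace example of the same arithmetic; the 10 ms / 100 ms task and the 4-CPU root domain are invented numbers, and the 950000/1000000 limit is the sysctl_sched_rt_runtime default quoted earlier in core.c.

/* Worked example of the admission-control arithmetic, userspace mirror of
 * to_ratio()/__dl_overflow(); numbers are purely illustrative. */
#include <stdio.h>
#include <stdint.h>

/* runtime/period as a fixed-point fraction with 20 fractional bits */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t runtime  =  10 * 1000 * 1000;	/*  10 ms in ns */
	uint64_t period   = 100 * 1000 * 1000;	/* 100 ms in ns */
	int      cpus     = 4;

	/* default limit: 950000 us runnable time per 1000000 us period */
	uint64_t limit    = to_ratio(1000000, 950000);	/* ~0.95 * 2^20 */
	uint64_t new_bw   = to_ratio(period, runtime);	/* ~0.10 * 2^20 */
	uint64_t total_bw = 0;				/* nothing admitted yet */

	int overflow = limit * cpus < total_bw + new_bw;

	printf("task bw = %llu/1048576 (~%.2f%%), %s\n",
	       (unsigned long long)new_bw, 100.0 * new_bw / (1 << 20),
	       overflow ? "rejected" : "admitted");
	return 0;
}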
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2003 | if (unlikely(prev_state == TASK_DEAD)) { | 2151 | if (unlikely(prev_state == TASK_DEAD)) { |
2004 | task_numa_free(prev); | 2152 | task_numa_free(prev); |
2005 | 2153 | ||
2154 | if (prev->sched_class->task_dead) | ||
2155 | prev->sched_class->task_dead(prev); | ||
2156 | |||
2006 | /* | 2157 | /* |
2007 | * Remove function-return probe instances associated with this | 2158 | * Remove function-return probe instances associated with this |
2008 | * task and put them back on the free list. | 2159 | * task and put them back on the free list. |
@@ -2296,7 +2447,7 @@ void scheduler_tick(void) | |||
2296 | 2447 | ||
2297 | #ifdef CONFIG_SMP | 2448 | #ifdef CONFIG_SMP |
2298 | rq->idle_balance = idle_cpu(cpu); | 2449 | rq->idle_balance = idle_cpu(cpu); |
2299 | trigger_load_balance(rq, cpu); | 2450 | trigger_load_balance(rq); |
2300 | #endif | 2451 | #endif |
2301 | rq_last_tick_reset(rq); | 2452 | rq_last_tick_reset(rq); |
2302 | } | 2453 | } |
@@ -2325,7 +2476,7 @@ u64 scheduler_tick_max_deferment(void) | |||
2325 | if (time_before_eq(next, now)) | 2476 | if (time_before_eq(next, now)) |
2326 | return 0; | 2477 | return 0; |
2327 | 2478 | ||
2328 | return jiffies_to_usecs(next - now) * NSEC_PER_USEC; | 2479 | return jiffies_to_nsecs(next - now); |
2329 | } | 2480 | } |
2330 | #endif | 2481 | #endif |
2331 | 2482 | ||
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev) | |||
2414 | { | 2565 | { |
2415 | /* | 2566 | /* |
2416 | * Test if we are atomic. Since do_exit() needs to call into | 2567 | * Test if we are atomic. Since do_exit() needs to call into |
2417 | * schedule() atomically, we ignore that path for now. | 2568 | * schedule() atomically, we ignore that path. Otherwise whine |
2418 | * Otherwise, whine if we are scheduling when we should not be. | 2569 | * if we are scheduling when we should not. |
2419 | */ | 2570 | */ |
2420 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 2571 | if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) |
2421 | __schedule_bug(prev); | 2572 | __schedule_bug(prev); |
2422 | rcu_sleep_check(); | 2573 | rcu_sleep_check(); |
2423 | 2574 | ||
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
2761 | */ | 2912 | */ |
2762 | void rt_mutex_setprio(struct task_struct *p, int prio) | 2913 | void rt_mutex_setprio(struct task_struct *p, int prio) |
2763 | { | 2914 | { |
2764 | int oldprio, on_rq, running; | 2915 | int oldprio, on_rq, running, enqueue_flag = 0; |
2765 | struct rq *rq; | 2916 | struct rq *rq; |
2766 | const struct sched_class *prev_class; | 2917 | const struct sched_class *prev_class; |
2767 | 2918 | ||
2768 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 2919 | BUG_ON(prio > MAX_PRIO); |
2769 | 2920 | ||
2770 | rq = __task_rq_lock(p); | 2921 | rq = __task_rq_lock(p); |
2771 | 2922 | ||
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2788 | } | 2939 | } |
2789 | 2940 | ||
2790 | trace_sched_pi_setprio(p, prio); | 2941 | trace_sched_pi_setprio(p, prio); |
2942 | p->pi_top_task = rt_mutex_get_top_task(p); | ||
2791 | oldprio = p->prio; | 2943 | oldprio = p->prio; |
2792 | prev_class = p->sched_class; | 2944 | prev_class = p->sched_class; |
2793 | on_rq = p->on_rq; | 2945 | on_rq = p->on_rq; |
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2797 | if (running) | 2949 | if (running) |
2798 | p->sched_class->put_prev_task(rq, p); | 2950 | p->sched_class->put_prev_task(rq, p); |
2799 | 2951 | ||
2800 | if (rt_prio(prio)) | 2952 | /* |
2953 | * Boosting condition are: | ||
2954 | * 1. -rt task is running and holds mutex A | ||
2955 | * --> -dl task blocks on mutex A | ||
2956 | * | ||
2957 | * 2. -dl task is running and holds mutex A | ||
2958 | * --> -dl task blocks on mutex A and could preempt the | ||
2959 | * running task | ||
2960 | */ | ||
2961 | if (dl_prio(prio)) { | ||
2962 | if (!dl_prio(p->normal_prio) || (p->pi_top_task && | ||
2963 | dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { | ||
2964 | p->dl.dl_boosted = 1; | ||
2965 | p->dl.dl_throttled = 0; | ||
2966 | enqueue_flag = ENQUEUE_REPLENISH; | ||
2967 | } else | ||
2968 | p->dl.dl_boosted = 0; | ||
2969 | p->sched_class = &dl_sched_class; | ||
2970 | } else if (rt_prio(prio)) { | ||
2971 | if (dl_prio(oldprio)) | ||
2972 | p->dl.dl_boosted = 0; | ||
2973 | if (oldprio < prio) | ||
2974 | enqueue_flag = ENQUEUE_HEAD; | ||
2801 | p->sched_class = &rt_sched_class; | 2975 | p->sched_class = &rt_sched_class; |
2802 | else | 2976 | } else { |
2977 | if (dl_prio(oldprio)) | ||
2978 | p->dl.dl_boosted = 0; | ||
2803 | p->sched_class = &fair_sched_class; | 2979 | p->sched_class = &fair_sched_class; |
2980 | } | ||
2804 | 2981 | ||
2805 | p->prio = prio; | 2982 | p->prio = prio; |
2806 | 2983 | ||
2807 | if (running) | 2984 | if (running) |
2808 | p->sched_class->set_curr_task(rq); | 2985 | p->sched_class->set_curr_task(rq); |
2809 | if (on_rq) | 2986 | if (on_rq) |
2810 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 2987 | enqueue_task(rq, p, enqueue_flag); |
2811 | 2988 | ||
2812 | check_class_changed(rq, p, prev_class, oldprio); | 2989 | check_class_changed(rq, p, prev_class, oldprio); |
2813 | out_unlock: | 2990 | out_unlock: |
2814 | __task_rq_unlock(rq); | 2991 | __task_rq_unlock(rq); |
2815 | } | 2992 | } |
2816 | #endif | 2993 | #endif |
2994 | |||
2817 | void set_user_nice(struct task_struct *p, long nice) | 2995 | void set_user_nice(struct task_struct *p, long nice) |
2818 | { | 2996 | { |
2819 | int old_prio, delta, on_rq; | 2997 | int old_prio, delta, on_rq; |
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
2831 | * The RT priorities are set via sched_setscheduler(), but we still | 3009 | * The RT priorities are set via sched_setscheduler(), but we still |
2832 | * allow the 'normal' nice value to be set - but as expected | 3010 | * allow the 'normal' nice value to be set - but as expected |
2833 | * it wont have any effect on scheduling until the task is | 3011 | * it wont have any effect on scheduling until the task is |
2834 | * SCHED_FIFO/SCHED_RR: | 3012 | * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: |
2835 | */ | 3013 | */ |
2836 | if (task_has_rt_policy(p)) { | 3014 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
2837 | p->static_prio = NICE_TO_PRIO(nice); | 3015 | p->static_prio = NICE_TO_PRIO(nice); |
2838 | goto out_unlock; | 3016 | goto out_unlock; |
2839 | } | 3017 | } |
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
2988 | return pid ? find_task_by_vpid(pid) : current; | 3166 | return pid ? find_task_by_vpid(pid) : current; |
2989 | } | 3167 | } |
2990 | 3168 | ||
2991 | /* Actually do priority change: must hold rq lock. */ | 3169 | /* |
3170 | * This function initializes the sched_dl_entity of a newly becoming | ||
3171 | * SCHED_DEADLINE task. | ||
3172 | * | ||
3173 | * Only the static values are considered here, the actual runtime and the | ||
3174 | * absolute deadline will be properly calculated when the task is enqueued | ||
3175 | * for the first time with its new policy. | ||
3176 | */ | ||
2992 | static void | 3177 | static void |
2993 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 3178 | __setparam_dl(struct task_struct *p, const struct sched_attr *attr) |
2994 | { | 3179 | { |
3180 | struct sched_dl_entity *dl_se = &p->dl; | ||
3181 | |||
3182 | init_dl_task_timer(dl_se); | ||
3183 | dl_se->dl_runtime = attr->sched_runtime; | ||
3184 | dl_se->dl_deadline = attr->sched_deadline; | ||
3185 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | ||
3186 | dl_se->flags = attr->sched_flags; | ||
3187 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
3188 | dl_se->dl_throttled = 0; | ||
3189 | dl_se->dl_new = 1; | ||
3190 | } | ||
3191 | |||
3192 | /* Actually do priority change: must hold pi & rq lock. */ | ||
3193 | static void __setscheduler(struct rq *rq, struct task_struct *p, | ||
3194 | const struct sched_attr *attr) | ||
3195 | { | ||
3196 | int policy = attr->sched_policy; | ||
3197 | |||
3198 | if (policy == -1) /* setparam */ | ||
3199 | policy = p->policy; | ||
3200 | |||
2995 | p->policy = policy; | 3201 | p->policy = policy; |
2996 | p->rt_priority = prio; | 3202 | |
3203 | if (dl_policy(policy)) | ||
3204 | __setparam_dl(p, attr); | ||
3205 | else if (fair_policy(policy)) | ||
3206 | p->static_prio = NICE_TO_PRIO(attr->sched_nice); | ||
3207 | |||
3208 | /* | ||
3209 | * __sched_setscheduler() ensures attr->sched_priority == 0 when | ||
3210 | * !rt_policy. Always setting this ensures that things like | ||
3211 | * getparam()/getattr() don't report silly values for !rt tasks. | ||
3212 | */ | ||
3213 | p->rt_priority = attr->sched_priority; | ||
3214 | |||
2997 | p->normal_prio = normal_prio(p); | 3215 | p->normal_prio = normal_prio(p); |
2998 | /* we are holding p->pi_lock already */ | ||
2999 | p->prio = rt_mutex_getprio(p); | 3216 | p->prio = rt_mutex_getprio(p); |
3000 | if (rt_prio(p->prio)) | 3217 | |
3218 | if (dl_prio(p->prio)) | ||
3219 | p->sched_class = &dl_sched_class; | ||
3220 | else if (rt_prio(p->prio)) | ||
3001 | p->sched_class = &rt_sched_class; | 3221 | p->sched_class = &rt_sched_class; |
3002 | else | 3222 | else |
3003 | p->sched_class = &fair_sched_class; | 3223 | p->sched_class = &fair_sched_class; |
3224 | |||
3004 | set_load_weight(p); | 3225 | set_load_weight(p); |
3005 | } | 3226 | } |
3006 | 3227 | ||
3228 | static void | ||
3229 | __getparam_dl(struct task_struct *p, struct sched_attr *attr) | ||
3230 | { | ||
3231 | struct sched_dl_entity *dl_se = &p->dl; | ||
3232 | |||
3233 | attr->sched_priority = p->rt_priority; | ||
3234 | attr->sched_runtime = dl_se->dl_runtime; | ||
3235 | attr->sched_deadline = dl_se->dl_deadline; | ||
3236 | attr->sched_period = dl_se->dl_period; | ||
3237 | attr->sched_flags = dl_se->flags; | ||
3238 | } | ||
3239 | |||
3240 | /* | ||
3241 | * This function validates the new parameters of a -deadline task. | ||
3242 | * We ask for the deadline not being zero, and greater or equal | ||
3243 | * than the runtime, as well as the period of being zero or | ||
3244 | * greater than deadline. Furthermore, we have to be sure that | ||
3245 | * user parameters are above the internal resolution (1us); we | ||
3246 | * check sched_runtime only since it is always the smaller one. | ||
3247 | */ | ||
3248 | static bool | ||
3249 | __checkparam_dl(const struct sched_attr *attr) | ||
3250 | { | ||
3251 | return attr && attr->sched_deadline != 0 && | ||
3252 | (attr->sched_period == 0 || | ||
3253 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | ||
3254 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && | ||
3255 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | ||
3256 | } | ||
3257 | |||
3007 | /* | 3258 | /* |
3008 | * check the target process has a UID that matches the current process's | 3259 | * check the target process has a UID that matches the current process's |
3009 | */ | 3260 | */ |
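__checkparam_dl() above enforces sched_runtime <= sched_deadline <= sched_period (a zero period meaning the period defaults to the deadline) plus a minimum runtime of 2 << (DL_SCALE - 1) nanoseconds, roughly 1 us assuming DL_SCALE is 10 as defined in kernel/sched/sched.h in this series. A small userspace mirror of that check, using plain unsigned comparisons instead of the kernel's signed-difference form, with two invented parameter sets showing one accepted and one rejected:

/* Userspace sketch of the -deadline parameter validation described above. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define DL_SCALE 10	/* assumed, matching kernel/sched/sched.h here */

struct dl_params {
	uint64_t runtime, deadline, period;	/* all in nanoseconds */
};

static bool checkparam_dl(const struct dl_params *p)
{
	return p->deadline != 0 &&
	       (p->period == 0 || p->period >= p->deadline) &&
	       p->deadline >= p->runtime &&
	       p->runtime >= (2ULL << (DL_SCALE - 1));
}

int main(void)
{
	struct dl_params ok  = { 10000000, 30000000, 100000000 }; /* 10/30/100 ms */
	struct dl_params bad = { 500, 30000000, 100000000 };	  /* runtime < ~1 us */

	printf("10ms/30ms/100ms: %s\n", checkparam_dl(&ok)  ? "valid" : "invalid");
	printf("500ns runtime:   %s\n", checkparam_dl(&bad) ? "valid" : "invalid");
	return 0;
}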
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p) | |||
3020 | return match; | 3271 | return match; |
3021 | } | 3272 | } |
3022 | 3273 | ||
3023 | static int __sched_setscheduler(struct task_struct *p, int policy, | 3274 | static int __sched_setscheduler(struct task_struct *p, |
3024 | const struct sched_param *param, bool user) | 3275 | const struct sched_attr *attr, |
3276 | bool user) | ||
3025 | { | 3277 | { |
3026 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3278 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
3279 | int policy = attr->sched_policy; | ||
3027 | unsigned long flags; | 3280 | unsigned long flags; |
3028 | const struct sched_class *prev_class; | 3281 | const struct sched_class *prev_class; |
3029 | struct rq *rq; | 3282 | struct rq *rq; |
@@ -3037,31 +3290,40 @@ recheck: | |||
3037 | reset_on_fork = p->sched_reset_on_fork; | 3290 | reset_on_fork = p->sched_reset_on_fork; |
3038 | policy = oldpolicy = p->policy; | 3291 | policy = oldpolicy = p->policy; |
3039 | } else { | 3292 | } else { |
3040 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); | 3293 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); |
3041 | policy &= ~SCHED_RESET_ON_FORK; | ||
3042 | 3294 | ||
3043 | if (policy != SCHED_FIFO && policy != SCHED_RR && | 3295 | if (policy != SCHED_DEADLINE && |
3296 | policy != SCHED_FIFO && policy != SCHED_RR && | ||
3044 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 3297 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
3045 | policy != SCHED_IDLE) | 3298 | policy != SCHED_IDLE) |
3046 | return -EINVAL; | 3299 | return -EINVAL; |
3047 | } | 3300 | } |
3048 | 3301 | ||
3302 | if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) | ||
3303 | return -EINVAL; | ||
3304 | |||
3049 | /* | 3305 | /* |
3050 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 3306 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
3051 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 3307 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
3052 | * SCHED_BATCH and SCHED_IDLE is 0. | 3308 | * SCHED_BATCH and SCHED_IDLE is 0. |
3053 | */ | 3309 | */ |
3054 | if (param->sched_priority < 0 || | 3310 | if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || |
3055 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3311 | (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) |
3056 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | ||
3057 | return -EINVAL; | 3312 | return -EINVAL; |
3058 | if (rt_policy(policy) != (param->sched_priority != 0)) | 3313 | if ((dl_policy(policy) && !__checkparam_dl(attr)) || |
3314 | (rt_policy(policy) != (attr->sched_priority != 0))) | ||
3059 | return -EINVAL; | 3315 | return -EINVAL; |
3060 | 3316 | ||
3061 | /* | 3317 | /* |
3062 | * Allow unprivileged RT tasks to decrease priority: | 3318 | * Allow unprivileged RT tasks to decrease priority: |
3063 | */ | 3319 | */ |
3064 | if (user && !capable(CAP_SYS_NICE)) { | 3320 | if (user && !capable(CAP_SYS_NICE)) { |
3321 | if (fair_policy(policy)) { | ||
3322 | if (attr->sched_nice < TASK_NICE(p) && | ||
3323 | !can_nice(p, attr->sched_nice)) | ||
3324 | return -EPERM; | ||
3325 | } | ||
3326 | |||
3065 | if (rt_policy(policy)) { | 3327 | if (rt_policy(policy)) { |
3066 | unsigned long rlim_rtprio = | 3328 | unsigned long rlim_rtprio = |
3067 | task_rlimit(p, RLIMIT_RTPRIO); | 3329 | task_rlimit(p, RLIMIT_RTPRIO); |
@@ -3071,11 +3333,20 @@ recheck: | |||
3071 | return -EPERM; | 3333 | return -EPERM; |
3072 | 3334 | ||
3073 | /* can't increase priority */ | 3335 | /* can't increase priority */ |
3074 | if (param->sched_priority > p->rt_priority && | 3336 | if (attr->sched_priority > p->rt_priority && |
3075 | param->sched_priority > rlim_rtprio) | 3337 | attr->sched_priority > rlim_rtprio) |
3076 | return -EPERM; | 3338 | return -EPERM; |
3077 | } | 3339 | } |
3078 | 3340 | ||
3341 | /* | ||
3342 | * Can't set/change SCHED_DEADLINE policy at all for now | ||
3343 | * (safest behavior); in the future we would like to allow | ||
3344 | * unprivileged DL tasks to increase their relative deadline | ||
3345 | * or reduce their runtime (both ways reducing utilization) | ||
3346 | */ | ||
3347 | if (dl_policy(policy)) | ||
3348 | return -EPERM; | ||
3349 | |||
3079 | /* | 3350 | /* |
3080 | * Treat SCHED_IDLE as nice 20. Only allow a switch to | 3351 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
3081 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. | 3352 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
@@ -3120,14 +3391,21 @@ recheck: | |||
3120 | /* | 3391 | /* |
3121 | * If not changing anything there's no need to proceed further: | 3392 | * If not changing anything there's no need to proceed further: |
3122 | */ | 3393 | */ |
3123 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 3394 | if (unlikely(policy == p->policy)) { |
3124 | param->sched_priority == p->rt_priority))) { | 3395 | if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) |
3396 | goto change; | ||
3397 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | ||
3398 | goto change; | ||
3399 | if (dl_policy(policy)) | ||
3400 | goto change; | ||
3401 | |||
3125 | task_rq_unlock(rq, p, &flags); | 3402 | task_rq_unlock(rq, p, &flags); |
3126 | return 0; | 3403 | return 0; |
3127 | } | 3404 | } |
3405 | change: | ||
3128 | 3406 | ||
3129 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3130 | if (user) { | 3407 | if (user) { |
3408 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3131 | /* | 3409 | /* |
3132 | * Do not allow realtime tasks into groups that have no runtime | 3410 | * Do not allow realtime tasks into groups that have no runtime |
3133 | * assigned. | 3411 | * assigned. |
@@ -3138,8 +3416,24 @@ recheck: | |||
3138 | task_rq_unlock(rq, p, &flags); | 3416 | task_rq_unlock(rq, p, &flags); |
3139 | return -EPERM; | 3417 | return -EPERM; |
3140 | } | 3418 | } |
3141 | } | ||
3142 | #endif | 3419 | #endif |
3420 | #ifdef CONFIG_SMP | ||
3421 | if (dl_bandwidth_enabled() && dl_policy(policy)) { | ||
3422 | cpumask_t *span = rq->rd->span; | ||
3423 | |||
3424 | /* | ||
3425 | * Don't allow tasks with an affinity mask smaller than | ||
3426 | * the entire root_domain to become SCHED_DEADLINE. We | ||
3427 | * will also fail if there's no bandwidth available. | ||
3428 | */ | ||
3429 | if (!cpumask_subset(span, &p->cpus_allowed) || | ||
3430 | rq->rd->dl_bw.bw == 0) { | ||
3431 | task_rq_unlock(rq, p, &flags); | ||
3432 | return -EPERM; | ||
3433 | } | ||
3434 | } | ||
3435 | #endif | ||
3436 | } | ||
3143 | 3437 | ||
3144 | /* recheck policy now with rq lock held */ | 3438 | /* recheck policy now with rq lock held */ |
3145 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 3439 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
@@ -3147,6 +3441,17 @@ recheck: | |||
3147 | task_rq_unlock(rq, p, &flags); | 3441 | task_rq_unlock(rq, p, &flags); |
3148 | goto recheck; | 3442 | goto recheck; |
3149 | } | 3443 | } |
3444 | |||
3445 | /* | ||
3446 | * If setscheduling to SCHED_DEADLINE (or changing the parameters | ||
3447 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth | ||
3448 | * is available. | ||
3449 | */ | ||
3450 | if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { | ||
3451 | task_rq_unlock(rq, p, &flags); | ||
3452 | return -EBUSY; | ||
3453 | } | ||
3454 | |||
3150 | on_rq = p->on_rq; | 3455 | on_rq = p->on_rq; |
3151 | running = task_current(rq, p); | 3456 | running = task_current(rq, p); |
3152 | if (on_rq) | 3457 | if (on_rq) |
@@ -3158,7 +3463,7 @@ recheck: | |||
3158 | 3463 | ||
3159 | oldprio = p->prio; | 3464 | oldprio = p->prio; |
3160 | prev_class = p->sched_class; | 3465 | prev_class = p->sched_class; |
3161 | __setscheduler(rq, p, policy, param->sched_priority); | 3466 | __setscheduler(rq, p, attr); |
3162 | 3467 | ||
3163 | if (running) | 3468 | if (running) |
3164 | p->sched_class->set_curr_task(rq); | 3469 | p->sched_class->set_curr_task(rq); |
@@ -3173,6 +3478,26 @@ recheck: | |||
3173 | return 0; | 3478 | return 0; |
3174 | } | 3479 | } |
3175 | 3480 | ||
3481 | static int _sched_setscheduler(struct task_struct *p, int policy, | ||
3482 | const struct sched_param *param, bool check) | ||
3483 | { | ||
3484 | struct sched_attr attr = { | ||
3485 | .sched_policy = policy, | ||
3486 | .sched_priority = param->sched_priority, | ||
3487 | .sched_nice = PRIO_TO_NICE(p->static_prio), | ||
3488 | }; | ||
3489 | |||
3490 | /* | ||
3491 | * Fixup the legacy SCHED_RESET_ON_FORK hack | ||
3492 | */ | ||
3493 | if (policy & SCHED_RESET_ON_FORK) { | ||
3494 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
3495 | policy &= ~SCHED_RESET_ON_FORK; | ||
3496 | attr.sched_policy = policy; | ||
3497 | } | ||
3498 | |||
3499 | return __sched_setscheduler(p, &attr, check); | ||
3500 | } | ||
3176 | /** | 3501 | /** |
3177 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 3502 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
3178 | * @p: the task in question. | 3503 | * @p: the task in question. |
@@ -3186,10 +3511,16 @@ recheck: | |||
3186 | int sched_setscheduler(struct task_struct *p, int policy, | 3511 | int sched_setscheduler(struct task_struct *p, int policy, |
3187 | const struct sched_param *param) | 3512 | const struct sched_param *param) |
3188 | { | 3513 | { |
3189 | return __sched_setscheduler(p, policy, param, true); | 3514 | return _sched_setscheduler(p, policy, param, true); |
3190 | } | 3515 | } |
3191 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3516 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
3192 | 3517 | ||
3518 | int sched_setattr(struct task_struct *p, const struct sched_attr *attr) | ||
3519 | { | ||
3520 | return __sched_setscheduler(p, attr, true); | ||
3521 | } | ||
3522 | EXPORT_SYMBOL_GPL(sched_setattr); | ||
3523 | |||
3193 | /** | 3524 | /** |
3194 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | 3525 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
3195 | * @p: the task in question. | 3526 | * @p: the task in question. |
@@ -3206,7 +3537,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
3206 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3537 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
3207 | const struct sched_param *param) | 3538 | const struct sched_param *param) |
3208 | { | 3539 | { |
3209 | return __sched_setscheduler(p, policy, param, false); | 3540 | return _sched_setscheduler(p, policy, param, false); |
3210 | } | 3541 | } |
3211 | 3542 | ||
3212 | static int | 3543 | static int |
@@ -3231,6 +3562,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3231 | return retval; | 3562 | return retval; |
3232 | } | 3563 | } |
3233 | 3564 | ||
3565 | /* | ||
3566 | * Mimics kernel/events/core.c perf_copy_attr(). | ||
3567 | */ | ||
3568 | static int sched_copy_attr(struct sched_attr __user *uattr, | ||
3569 | struct sched_attr *attr) | ||
3570 | { | ||
3571 | u32 size; | ||
3572 | int ret; | ||
3573 | |||
3574 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | ||
3575 | return -EFAULT; | ||
3576 | |||
3577 | /* | ||
3578 | * zero the full structure, so that a short copy will be nice. | ||
3579 | */ | ||
3580 | memset(attr, 0, sizeof(*attr)); | ||
3581 | |||
3582 | ret = get_user(size, &uattr->size); | ||
3583 | if (ret) | ||
3584 | return ret; | ||
3585 | |||
3586 | if (size > PAGE_SIZE) /* silly large */ | ||
3587 | goto err_size; | ||
3588 | |||
3589 | if (!size) /* abi compat */ | ||
3590 | size = SCHED_ATTR_SIZE_VER0; | ||
3591 | |||
3592 | if (size < SCHED_ATTR_SIZE_VER0) | ||
3593 | goto err_size; | ||
3594 | |||
3595 | /* | ||
3596 | * If we're handed a bigger struct than we know of, | ||
3597 | * ensure all the unknown bits are 0 - i.e. new | ||
3598 | * user-space does not rely on any kernel feature | ||
3599 | * extensions we dont know about yet. | ||
3600 | */ | ||
3601 | if (size > sizeof(*attr)) { | ||
3602 | unsigned char __user *addr; | ||
3603 | unsigned char __user *end; | ||
3604 | unsigned char val; | ||
3605 | |||
3606 | addr = (void __user *)uattr + sizeof(*attr); | ||
3607 | end = (void __user *)uattr + size; | ||
3608 | |||
3609 | for (; addr < end; addr++) { | ||
3610 | ret = get_user(val, addr); | ||
3611 | if (ret) | ||
3612 | return ret; | ||
3613 | if (val) | ||
3614 | goto err_size; | ||
3615 | } | ||
3616 | size = sizeof(*attr); | ||
3617 | } | ||
3618 | |||
3619 | ret = copy_from_user(attr, uattr, size); | ||
3620 | if (ret) | ||
3621 | return -EFAULT; | ||
3622 | |||
3623 | /* | ||
3624 | * XXX: do we want to be lenient like existing syscalls; or do we want | ||
3625 | * to be strict and return an error on out-of-bounds values? | ||
3626 | */ | ||
3627 | attr->sched_nice = clamp(attr->sched_nice, -20, 19); | ||
3628 | |||
3629 | out: | ||
3630 | return ret; | ||
3631 | |||
3632 | err_size: | ||
3633 | put_user(sizeof(*attr), &uattr->size); | ||
3634 | ret = -E2BIG; | ||
3635 | goto out; | ||
3636 | } | ||
3637 | |||
3234 | /** | 3638 | /** |
3235 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 3639 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
3236 | * @pid: the pid in question. | 3640 | * @pid: the pid in question. |
@@ -3262,6 +3666,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
3262 | } | 3666 | } |
3263 | 3667 | ||
3264 | /** | 3668 | /** |
3669 | * sys_sched_setattr - same as above, but with extended sched_attr | ||
3670 | * @pid: the pid in question. | ||
3671 | * @uattr: structure containing the extended parameters. | ||
3672 | */ | ||
3673 | SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | ||
3674 | unsigned int, flags) | ||
3675 | { | ||
3676 | struct sched_attr attr; | ||
3677 | struct task_struct *p; | ||
3678 | int retval; | ||
3679 | |||
3680 | if (!uattr || pid < 0 || flags) | ||
3681 | return -EINVAL; | ||
3682 | |||
3683 | if (sched_copy_attr(uattr, &attr)) | ||
3684 | return -EFAULT; | ||
3685 | |||
3686 | rcu_read_lock(); | ||
3687 | retval = -ESRCH; | ||
3688 | p = find_process_by_pid(pid); | ||
3689 | if (p != NULL) | ||
3690 | retval = sched_setattr(p, &attr); | ||
3691 | rcu_read_unlock(); | ||
3692 | |||
3693 | return retval; | ||
3694 | } | ||
3695 | |||
3696 | /** | ||
3265 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3697 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
3266 | * @pid: the pid in question. | 3698 | * @pid: the pid in question. |
3267 | * | 3699 | * |
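sys_sched_setattr() above is the entry point userspace uses to create a SCHED_DEADLINE task, since sched_setscheduler() cannot carry the three new parameters. A minimal userspace sketch follows; glibc has no wrapper at this point, so it goes through syscall(2) directly. The syscall number 314 and SCHED_DEADLINE == 6 are the x86-64 values and should be checked against your architecture's headers, and the 10/30/100 ms parameters are just an example. Note that the hunks above make SCHED_DEADLINE root-only for now, so expect EPERM without CAP_SYS_NICE.

/* Illustrative userspace use of the new sched_setattr() syscall. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314		/* x86-64; differs on other arches */
#endif
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL / SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO / SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline =  30 * 1000 * 1000;	/*  30 ms */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	if (syscall(__NR_sched_setattr, 0, &attr, 0) < 0) {	/* pid 0: self */
		perror("sched_setattr");	/* e.g. EPERM without CAP_SYS_NICE */
		return 1;
	}

	/* ... deadline-scheduled work would run here ... */
	return 0;
}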
@@ -3316,6 +3748,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
3316 | if (retval) | 3748 | if (retval) |
3317 | goto out_unlock; | 3749 | goto out_unlock; |
3318 | 3750 | ||
3751 | if (task_has_dl_policy(p)) { | ||
3752 | retval = -EINVAL; | ||
3753 | goto out_unlock; | ||
3754 | } | ||
3319 | lp.sched_priority = p->rt_priority; | 3755 | lp.sched_priority = p->rt_priority; |
3320 | rcu_read_unlock(); | 3756 | rcu_read_unlock(); |
3321 | 3757 | ||
@@ -3331,6 +3767,96 @@ out_unlock: | |||
3331 | return retval; | 3767 | return retval; |
3332 | } | 3768 | } |
3333 | 3769 | ||
3770 | static int sched_read_attr(struct sched_attr __user *uattr, | ||
3771 | struct sched_attr *attr, | ||
3772 | unsigned int usize) | ||
3773 | { | ||
3774 | int ret; | ||
3775 | |||
3776 | if (!access_ok(VERIFY_WRITE, uattr, usize)) | ||
3777 | return -EFAULT; | ||
3778 | |||
3779 | /* | ||
3780 | * If we're handed a smaller struct than we know of, | ||
3781 | * ensure all the unknown bits are 0 - i.e. old | ||
3782 | * user-space does not get uncomplete information. | ||
3783 | */ | ||
3784 | if (usize < sizeof(*attr)) { | ||
3785 | unsigned char *addr; | ||
3786 | unsigned char *end; | ||
3787 | |||
3788 | addr = (void *)attr + usize; | ||
3789 | end = (void *)attr + sizeof(*attr); | ||
3790 | |||
3791 | for (; addr < end; addr++) { | ||
3792 | if (*addr) | ||
3793 | goto err_size; | ||
3794 | } | ||
3795 | |||
3796 | attr->size = usize; | ||
3797 | } | ||
3798 | |||
3799 | ret = copy_to_user(uattr, attr, attr->size); | ||
3800 | if (ret) | ||
3801 | return -EFAULT; | ||
3802 | |||
3803 | out: | ||
3804 | return ret; | ||
3805 | |||
3806 | err_size: | ||
3807 | ret = -E2BIG; | ||
3808 | goto out; | ||
3809 | } | ||
3810 | |||
3811 | /** | ||
3812 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr | ||
3813 | * @pid: the pid in question. | ||
3814 | * @uattr: structure containing the extended parameters. | ||
3815 | * @size: sizeof(attr) for fwd/bwd comp. | ||
3816 | */ | ||
3817 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | ||
3818 | unsigned int, size, unsigned int, flags) | ||
3819 | { | ||
3820 | struct sched_attr attr = { | ||
3821 | .size = sizeof(struct sched_attr), | ||
3822 | }; | ||
3823 | struct task_struct *p; | ||
3824 | int retval; | ||
3825 | |||
3826 | if (!uattr || pid < 0 || size > PAGE_SIZE || | ||
3827 | size < SCHED_ATTR_SIZE_VER0 || flags) | ||
3828 | return -EINVAL; | ||
3829 | |||
3830 | rcu_read_lock(); | ||
3831 | p = find_process_by_pid(pid); | ||
3832 | retval = -ESRCH; | ||
3833 | if (!p) | ||
3834 | goto out_unlock; | ||
3835 | |||
3836 | retval = security_task_getscheduler(p); | ||
3837 | if (retval) | ||
3838 | goto out_unlock; | ||
3839 | |||
3840 | attr.sched_policy = p->policy; | ||
3841 | if (p->sched_reset_on_fork) | ||
3842 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
3843 | if (task_has_dl_policy(p)) | ||
3844 | __getparam_dl(p, &attr); | ||
3845 | else if (task_has_rt_policy(p)) | ||
3846 | attr.sched_priority = p->rt_priority; | ||
3847 | else | ||
3848 | attr.sched_nice = TASK_NICE(p); | ||
3849 | |||
3850 | rcu_read_unlock(); | ||
3851 | |||
3852 | retval = sched_read_attr(uattr, &attr, size); | ||
3853 | return retval; | ||
3854 | |||
3855 | out_unlock: | ||
3856 | rcu_read_unlock(); | ||
3857 | return retval; | ||
3858 | } | ||
3859 | |||
3334 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | 3860 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) |
3335 | { | 3861 | { |
3336 | cpumask_var_t cpus_allowed, new_mask; | 3862 | cpumask_var_t cpus_allowed, new_mask; |
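The matching sys_sched_getattr() path above reports the parameters back, going through sched_read_attr() so older userspace handing in a smaller struct still gets consistent data. A companion sketch, reusing struct sched_attr and the headers from the sched_setattr example above; 315 is the x86-64 syscall number and, as before, an assumption to verify for other architectures.

/* Illustrative companion to the sched_setattr sketch: read the values back. */
#ifndef __NR_sched_getattr
#define __NR_sched_getattr 315		/* x86-64; differs on other arches */
#endif

static void dump_dl_params(pid_t pid)	/* pid 0 means the calling thread */
{
	struct sched_attr attr;

	if (syscall(__NR_sched_getattr, pid, &attr, sizeof(attr), 0) < 0) {
		perror("sched_getattr");
		return;
	}

	printf("policy=%u runtime=%llu deadline=%llu period=%llu\n",
	       attr.sched_policy,
	       (unsigned long long)attr.sched_runtime,
	       (unsigned long long)attr.sched_deadline,
	       (unsigned long long)attr.sched_period);
}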
@@ -3375,8 +3901,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3375 | if (retval) | 3901 | if (retval) |
3376 | goto out_unlock; | 3902 | goto out_unlock; |
3377 | 3903 | ||
3904 | |||
3378 | cpuset_cpus_allowed(p, cpus_allowed); | 3905 | cpuset_cpus_allowed(p, cpus_allowed); |
3379 | cpumask_and(new_mask, in_mask, cpus_allowed); | 3906 | cpumask_and(new_mask, in_mask, cpus_allowed); |
3907 | |||
3908 | /* | ||
3909 | * Since bandwidth control happens on root_domain basis, | ||
3910 | * if admission test is enabled, we only admit -deadline | ||
3911 | * tasks allowed to run on all the CPUs in the task's | ||
3912 | * root_domain. | ||
3913 | */ | ||
3914 | #ifdef CONFIG_SMP | ||
3915 | if (task_has_dl_policy(p)) { | ||
3916 | const struct cpumask *span = task_rq(p)->rd->span; | ||
3917 | |||
3918 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
3919 | retval = -EBUSY; | ||
3920 | goto out_unlock; | ||
3921 | } | ||
3922 | } | ||
3923 | #endif | ||
3380 | again: | 3924 | again: |
3381 | retval = set_cpus_allowed_ptr(p, new_mask); | 3925 | retval = set_cpus_allowed_ptr(p, new_mask); |
3382 | 3926 | ||
@@ -3653,7 +4197,7 @@ again: | |||
3653 | } | 4197 | } |
3654 | 4198 | ||
3655 | double_rq_lock(rq, p_rq); | 4199 | double_rq_lock(rq, p_rq); |
3656 | while (task_rq(p) != p_rq) { | 4200 | if (task_rq(p) != p_rq) { |
3657 | double_rq_unlock(rq, p_rq); | 4201 | double_rq_unlock(rq, p_rq); |
3658 | goto again; | 4202 | goto again; |
3659 | } | 4203 | } |
@@ -3742,6 +4286,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
3742 | case SCHED_RR: | 4286 | case SCHED_RR: |
3743 | ret = MAX_USER_RT_PRIO-1; | 4287 | ret = MAX_USER_RT_PRIO-1; |
3744 | break; | 4288 | break; |
4289 | case SCHED_DEADLINE: | ||
3745 | case SCHED_NORMAL: | 4290 | case SCHED_NORMAL: |
3746 | case SCHED_BATCH: | 4291 | case SCHED_BATCH: |
3747 | case SCHED_IDLE: | 4292 | case SCHED_IDLE: |
@@ -3768,6 +4313,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
3768 | case SCHED_RR: | 4313 | case SCHED_RR: |
3769 | ret = 1; | 4314 | ret = 1; |
3770 | break; | 4315 | break; |
4316 | case SCHED_DEADLINE: | ||
3771 | case SCHED_NORMAL: | 4317 | case SCHED_NORMAL: |
3772 | case SCHED_BATCH: | 4318 | case SCHED_BATCH: |
3773 | case SCHED_IDLE: | 4319 | case SCHED_IDLE: |
@@ -3811,7 +4357,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
3811 | goto out_unlock; | 4357 | goto out_unlock; |
3812 | 4358 | ||
3813 | rq = task_rq_lock(p, &flags); | 4359 | rq = task_rq_lock(p, &flags); |
3814 | time_slice = p->sched_class->get_rr_interval(rq, p); | 4360 | time_slice = 0; |
4361 | if (p->sched_class->get_rr_interval) | ||
4362 | time_slice = p->sched_class->get_rr_interval(rq, p); | ||
3815 | task_rq_unlock(rq, p, &flags); | 4363 | task_rq_unlock(rq, p, &flags); |
3816 | 4364 | ||
3817 | rcu_read_unlock(); | 4365 | rcu_read_unlock(); |
@@ -4090,6 +4638,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) | |||
4090 | 4638 | ||
4091 | /* TODO: This is not properly updating schedstats */ | 4639 | /* TODO: This is not properly updating schedstats */ |
4092 | 4640 | ||
4641 | trace_sched_move_numa(p, curr_cpu, target_cpu); | ||
4093 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | 4642 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); |
4094 | } | 4643 | } |
4095 | 4644 | ||
@@ -4514,13 +5063,31 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
4514 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5063 | static int sched_cpu_inactive(struct notifier_block *nfb, |
4515 | unsigned long action, void *hcpu) | 5064 | unsigned long action, void *hcpu) |
4516 | { | 5065 | { |
5066 | unsigned long flags; | ||
5067 | long cpu = (long)hcpu; | ||
5068 | |||
4517 | switch (action & ~CPU_TASKS_FROZEN) { | 5069 | switch (action & ~CPU_TASKS_FROZEN) { |
4518 | case CPU_DOWN_PREPARE: | 5070 | case CPU_DOWN_PREPARE: |
4519 | set_cpu_active((long)hcpu, false); | 5071 | set_cpu_active(cpu, false); |
5072 | |||
5073 | /* explicitly allow suspend */ | ||
5074 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5075 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
5076 | bool overflow; | ||
5077 | int cpus; | ||
5078 | |||
5079 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5080 | cpus = dl_bw_cpus(cpu); | ||
5081 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5082 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5083 | |||
5084 | if (overflow) | ||
5085 | return notifier_from_errno(-EBUSY); | ||
5086 | } | ||
4520 | return NOTIFY_OK; | 5087 | return NOTIFY_OK; |
4521 | default: | ||
4522 | return NOTIFY_DONE; | ||
4523 | } | 5088 | } |
5089 | |||
5090 | return NOTIFY_DONE; | ||
4524 | } | 5091 | } |
4525 | 5092 | ||
4526 | static int __init migration_init(void) | 5093 | static int __init migration_init(void) |
@@ -4739,6 +5306,8 @@ static void free_rootdomain(struct rcu_head *rcu) | |||
4739 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | 5306 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
4740 | 5307 | ||
4741 | cpupri_cleanup(&rd->cpupri); | 5308 | cpupri_cleanup(&rd->cpupri); |
5309 | cpudl_cleanup(&rd->cpudl); | ||
5310 | free_cpumask_var(rd->dlo_mask); | ||
4742 | free_cpumask_var(rd->rto_mask); | 5311 | free_cpumask_var(rd->rto_mask); |
4743 | free_cpumask_var(rd->online); | 5312 | free_cpumask_var(rd->online); |
4744 | free_cpumask_var(rd->span); | 5313 | free_cpumask_var(rd->span); |
@@ -4790,8 +5359,14 @@ static int init_rootdomain(struct root_domain *rd) | |||
4790 | goto out; | 5359 | goto out; |
4791 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 5360 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
4792 | goto free_span; | 5361 | goto free_span; |
4793 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 5362 | if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) |
4794 | goto free_online; | 5363 | goto free_online; |
5364 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5365 | goto free_dlo_mask; | ||
5366 | |||
5367 | init_dl_bw(&rd->dl_bw); | ||
5368 | if (cpudl_init(&rd->cpudl) != 0) | ||
5369 | goto free_dlo_mask; | ||
4795 | 5370 | ||
4796 | if (cpupri_init(&rd->cpupri) != 0) | 5371 | if (cpupri_init(&rd->cpupri) != 0) |
4797 | goto free_rto_mask; | 5372 | goto free_rto_mask; |
@@ -4799,6 +5374,8 @@ static int init_rootdomain(struct root_domain *rd) | |||
4799 | 5374 | ||
4800 | free_rto_mask: | 5375 | free_rto_mask: |
4801 | free_cpumask_var(rd->rto_mask); | 5376 | free_cpumask_var(rd->rto_mask); |
5377 | free_dlo_mask: | ||
5378 | free_cpumask_var(rd->dlo_mask); | ||
4802 | free_online: | 5379 | free_online: |
4803 | free_cpumask_var(rd->online); | 5380 | free_cpumask_var(rd->online); |
4804 | free_span: | 5381 | free_span: |
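init_rootdomain() above grows by one cpumask (dlo_mask) and the cpudl heap, and each new allocation slots into the existing goto-unwind chain so that a failure releases only what was actually set up. The same idiom outside the kernel, reduced to plain malloc()/free() (the names in this sketch are made up):

#include <stdlib.h>

struct ctx { int *a, *b, *c; };

/* Each allocation gets an unwind label; a failure frees only what exists. */
static int ctx_init(struct ctx *ctx)
{
	ctx->a = malloc(64);
	if (!ctx->a)
		goto out;
	ctx->b = malloc(64);
	if (!ctx->b)
		goto free_a;
	ctx->c = malloc(64);
	if (!ctx->c)
		goto free_b;
	return 0;

free_b:
	free(ctx->b);
free_a:
	free(ctx->a);
out:
	return -1;
}

int main(void)
{
	struct ctx ctx = { 0 };
	int ret = ctx_init(&ctx);

	if (!ret) {
		free(ctx.c);
		free(ctx.b);
		free(ctx.a);
	}
	return ret ? 1 : 0;
}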
@@ -6150,6 +6727,7 @@ void __init sched_init_smp(void) | |||
6150 | free_cpumask_var(non_isolated_cpus); | 6727 | free_cpumask_var(non_isolated_cpus); |
6151 | 6728 | ||
6152 | init_sched_rt_class(); | 6729 | init_sched_rt_class(); |
6730 | init_sched_dl_class(); | ||
6153 | } | 6731 | } |
6154 | #else | 6732 | #else |
6155 | void __init sched_init_smp(void) | 6733 | void __init sched_init_smp(void) |
@@ -6219,13 +6797,15 @@ void __init sched_init(void) | |||
6219 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6797 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
6220 | } | 6798 | } |
6221 | 6799 | ||
6800 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6801 | global_rt_period(), global_rt_runtime()); | ||
6802 | init_dl_bandwidth(&def_dl_bandwidth, | ||
6803 | global_rt_period(), global_rt_runtime()); | ||
6804 | |||
6222 | #ifdef CONFIG_SMP | 6805 | #ifdef CONFIG_SMP |
6223 | init_defrootdomain(); | 6806 | init_defrootdomain(); |
6224 | #endif | 6807 | #endif |
6225 | 6808 | ||
6226 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6227 | global_rt_period(), global_rt_runtime()); | ||
6228 | |||
6229 | #ifdef CONFIG_RT_GROUP_SCHED | 6809 | #ifdef CONFIG_RT_GROUP_SCHED |
6230 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 6810 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
6231 | global_rt_period(), global_rt_runtime()); | 6811 | global_rt_period(), global_rt_runtime()); |
@@ -6249,6 +6829,7 @@ void __init sched_init(void) | |||
6249 | rq->calc_load_update = jiffies + LOAD_FREQ; | 6829 | rq->calc_load_update = jiffies + LOAD_FREQ; |
6250 | init_cfs_rq(&rq->cfs); | 6830 | init_cfs_rq(&rq->cfs); |
6251 | init_rt_rq(&rq->rt, rq); | 6831 | init_rt_rq(&rq->rt, rq); |
6832 | init_dl_rq(&rq->dl, rq); | ||
6252 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6833 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6253 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 6834 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
6254 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6835 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -6320,10 +6901,6 @@ void __init sched_init(void) | |||
6320 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6901 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
6321 | #endif | 6902 | #endif |
6322 | 6903 | ||
6323 | #ifdef CONFIG_RT_MUTEXES | ||
6324 | plist_head_init(&init_task.pi_waiters); | ||
6325 | #endif | ||
6326 | |||
6327 | /* | 6904 | /* |
6328 | * The boot idle thread does lazy MMU switching as well: | 6905 | * The boot idle thread does lazy MMU switching as well: |
6329 | */ | 6906 | */ |
@@ -6397,13 +6974,16 @@ EXPORT_SYMBOL(__might_sleep); | |||
6397 | static void normalize_task(struct rq *rq, struct task_struct *p) | 6974 | static void normalize_task(struct rq *rq, struct task_struct *p) |
6398 | { | 6975 | { |
6399 | const struct sched_class *prev_class = p->sched_class; | 6976 | const struct sched_class *prev_class = p->sched_class; |
6977 | struct sched_attr attr = { | ||
6978 | .sched_policy = SCHED_NORMAL, | ||
6979 | }; | ||
6400 | int old_prio = p->prio; | 6980 | int old_prio = p->prio; |
6401 | int on_rq; | 6981 | int on_rq; |
6402 | 6982 | ||
6403 | on_rq = p->on_rq; | 6983 | on_rq = p->on_rq; |
6404 | if (on_rq) | 6984 | if (on_rq) |
6405 | dequeue_task(rq, p, 0); | 6985 | dequeue_task(rq, p, 0); |
6406 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 6986 | __setscheduler(rq, p, &attr); |
6407 | if (on_rq) { | 6987 | if (on_rq) { |
6408 | enqueue_task(rq, p, 0); | 6988 | enqueue_task(rq, p, 0); |
6409 | resched_task(rq->curr); | 6989 | resched_task(rq->curr); |
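normalize_task() now feeds __setscheduler() a struct sched_attr instead of a bare policy/priority pair; the same structure is what user space hands to the sched_setattr() system call introduced by this series. A hedged user-space sketch of admitting the calling task to SCHED_DEADLINE with it; the struct layout, the SCHED_DEADLINE value (6) and the x86-64 syscall number (314) are assumptions taken from the uapi headers of this series, so check them against your tree:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Assumed ABI (x86-64, as merged with this series). */
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif
#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	/* SCHED_DEADLINE parameters, in nanoseconds */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;	/* 10ms of budget ...   */
	attr.sched_deadline = 30 * 1000 * 1000;	/* ... by a 30ms deadline */
	attr.sched_period   = 30 * 1000 * 1000;	/* ... every 30ms         */

	if (syscall(__NR_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}

Running it typically needs root-level privileges, and the parameters must satisfy the runtime <= deadline <= period sanity checks performed at admission time.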
@@ -6433,7 +7013,7 @@ void normalize_rt_tasks(void) | |||
6433 | p->se.statistics.block_start = 0; | 7013 | p->se.statistics.block_start = 0; |
6434 | #endif | 7014 | #endif |
6435 | 7015 | ||
6436 | if (!rt_task(p)) { | 7016 | if (!dl_task(p) && !rt_task(p)) { |
6437 | /* | 7017 | /* |
6438 | * Renice negative nice level userspace | 7018 | * Renice negative nice level userspace |
6439 | * tasks back to 0: | 7019 | * tasks back to 0: |
@@ -6628,16 +7208,6 @@ void sched_move_task(struct task_struct *tsk) | |||
6628 | } | 7208 | } |
6629 | #endif /* CONFIG_CGROUP_SCHED */ | 7209 | #endif /* CONFIG_CGROUP_SCHED */ |
6630 | 7210 | ||
6631 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | ||
6632 | static unsigned long to_ratio(u64 period, u64 runtime) | ||
6633 | { | ||
6634 | if (runtime == RUNTIME_INF) | ||
6635 | return 1ULL << 20; | ||
6636 | |||
6637 | return div64_u64(runtime << 20, period); | ||
6638 | } | ||
6639 | #endif | ||
6640 | |||
6641 | #ifdef CONFIG_RT_GROUP_SCHED | 7211 | #ifdef CONFIG_RT_GROUP_SCHED |
6642 | /* | 7212 | /* |
6643 | * Ensure that the real time constraints are schedulable. | 7213 | * Ensure that the real time constraints are schedulable. |
@@ -6811,24 +7381,13 @@ static long sched_group_rt_period(struct task_group *tg) | |||
6811 | do_div(rt_period_us, NSEC_PER_USEC); | 7381 | do_div(rt_period_us, NSEC_PER_USEC); |
6812 | return rt_period_us; | 7382 | return rt_period_us; |
6813 | } | 7383 | } |
7384 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
6814 | 7385 | ||
7386 | #ifdef CONFIG_RT_GROUP_SCHED | ||
6815 | static int sched_rt_global_constraints(void) | 7387 | static int sched_rt_global_constraints(void) |
6816 | { | 7388 | { |
6817 | u64 runtime, period; | ||
6818 | int ret = 0; | 7389 | int ret = 0; |
6819 | 7390 | ||
6820 | if (sysctl_sched_rt_period <= 0) | ||
6821 | return -EINVAL; | ||
6822 | |||
6823 | runtime = global_rt_runtime(); | ||
6824 | period = global_rt_period(); | ||
6825 | |||
6826 | /* | ||
6827 | * Sanity check on the sysctl variables. | ||
6828 | */ | ||
6829 | if (runtime > period && runtime != RUNTIME_INF) | ||
6830 | return -EINVAL; | ||
6831 | |||
6832 | mutex_lock(&rt_constraints_mutex); | 7391 | mutex_lock(&rt_constraints_mutex); |
6833 | read_lock(&tasklist_lock); | 7392 | read_lock(&tasklist_lock); |
6834 | ret = __rt_schedulable(NULL, 0, 0); | 7393 | ret = __rt_schedulable(NULL, 0, 0); |
@@ -6851,17 +7410,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | |||
6851 | static int sched_rt_global_constraints(void) | 7410 | static int sched_rt_global_constraints(void) |
6852 | { | 7411 | { |
6853 | unsigned long flags; | 7412 | unsigned long flags; |
6854 | int i; | 7413 | int i, ret = 0; |
6855 | |||
6856 | if (sysctl_sched_rt_period <= 0) | ||
6857 | return -EINVAL; | ||
6858 | |||
6859 | /* | ||
6860 | * There's always some RT tasks in the root group | ||
6861 | * -- migration, kstopmachine etc.. | ||
6862 | */ | ||
6863 | if (sysctl_sched_rt_runtime == 0) | ||
6864 | return -EBUSY; | ||
6865 | 7414 | ||
6866 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 7415 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
6867 | for_each_possible_cpu(i) { | 7416 | for_each_possible_cpu(i) { |
@@ -6873,36 +7422,91 @@ static int sched_rt_global_constraints(void) | |||
6873 | } | 7422 | } |
6874 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 7423 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
6875 | 7424 | ||
6876 | return 0; | 7425 | return ret; |
6877 | } | 7426 | } |
6878 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7427 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6879 | 7428 | ||
6880 | int sched_rr_handler(struct ctl_table *table, int write, | 7429 | static int sched_dl_global_constraints(void) |
6881 | void __user *buffer, size_t *lenp, | ||
6882 | loff_t *ppos) | ||
6883 | { | 7430 | { |
6884 | int ret; | 7431 | u64 runtime = global_rt_runtime(); |
6885 | static DEFINE_MUTEX(mutex); | 7432 | u64 period = global_rt_period(); |
7433 | u64 new_bw = to_ratio(period, runtime); | ||
7434 | int cpu, ret = 0; | ||
7435 | unsigned long flags; | ||
6886 | 7436 | ||
6887 | mutex_lock(&mutex); | 7437 | /* |
6888 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7438 | * Here we want to check that the bandwidth is not being set to some

6889 | /* make sure that internally we keep jiffies */ | 7439 | * value smaller than the currently allocated bandwidth in |
6890 | /* also, writing zero resets timeslice to default */ | 7440 | * any of the root_domains. |
6891 | if (!ret && write) { | 7441 | * |
6892 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 7442 | * FIXME: Cycling over all the CPUs is overkill, but simpler than
6893 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 7443 | * cycling on root_domains... Discussion on different/better |
7444 | * solutions is welcome! | ||
7445 | */ | ||
7446 | for_each_possible_cpu(cpu) { | ||
7447 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
7448 | |||
7449 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7450 | if (new_bw < dl_b->total_bw) | ||
7451 | ret = -EBUSY; | ||
7452 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7453 | |||
7454 | if (ret) | ||
7455 | break; | ||
6894 | } | 7456 | } |
6895 | mutex_unlock(&mutex); | 7457 | |
6896 | return ret; | 7458 | return ret; |
6897 | } | 7459 | } |
6898 | 7460 | ||
7461 | static void sched_dl_do_global(void) | ||
7462 | { | ||
7463 | u64 new_bw = -1; | ||
7464 | int cpu; | ||
7465 | unsigned long flags; | ||
7466 | |||
7467 | def_dl_bandwidth.dl_period = global_rt_period(); | ||
7468 | def_dl_bandwidth.dl_runtime = global_rt_runtime(); | ||
7469 | |||
7470 | if (global_rt_runtime() != RUNTIME_INF) | ||
7471 | new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
7472 | |||
7473 | /* | ||
7474 | * FIXME: As above... | ||
7475 | */ | ||
7476 | for_each_possible_cpu(cpu) { | ||
7477 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
7478 | |||
7479 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7480 | dl_b->bw = new_bw; | ||
7481 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7482 | } | ||
7483 | } | ||
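Both sched_dl_global_constraints() and sched_dl_do_global() compare bandwidths as Q20 fixed-point fractions produced by to_ratio(): runtime/period scaled by 2^20, with RUNTIME_INF treated as full utilization. A quick stand-alone illustration of that encoding, using the default 950000us/1000000us sysctl values as example inputs (this is a sketch of the arithmetic, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT	20
#define RUNTIME_INF	((uint64_t)~0ULL)

/* Same shape as to_ratio(): runtime/period in Q20 fixed point. */
static uint64_t to_ratio_demo(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << BW_SHIFT;
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* Default sysctls: 950000us of runtime over a 1000000us period */
	uint64_t bw = to_ratio_demo(1000000, 950000);

	printf("global bw = %llu/%llu (~%.2f%% of each CPU)\n",
	       (unsigned long long)bw, 1ULL << BW_SHIFT,
	       100.0 * bw / (1 << BW_SHIFT));
	return 0;
}

The constraints check then only has to compare this integer against the total_bw already admitted per root domain, as the loop above does.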
7484 | |||
7485 | static int sched_rt_global_validate(void) | ||
7486 | { | ||
7487 | if (sysctl_sched_rt_period <= 0) | ||
7488 | return -EINVAL; | ||
7489 | |||
7490 | if ((sysctl_sched_rt_runtime != RUNTIME_INF) && | ||
7491 | (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) | ||
7492 | return -EINVAL; | ||
7493 | |||
7494 | return 0; | ||
7495 | } | ||
7496 | |||
7497 | static void sched_rt_do_global(void) | ||
7498 | { | ||
7499 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
7500 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); | ||
7501 | } | ||
7502 | |||
6899 | int sched_rt_handler(struct ctl_table *table, int write, | 7503 | int sched_rt_handler(struct ctl_table *table, int write, |
6900 | void __user *buffer, size_t *lenp, | 7504 | void __user *buffer, size_t *lenp, |
6901 | loff_t *ppos) | 7505 | loff_t *ppos) |
6902 | { | 7506 | { |
6903 | int ret; | ||
6904 | int old_period, old_runtime; | 7507 | int old_period, old_runtime; |
6905 | static DEFINE_MUTEX(mutex); | 7508 | static DEFINE_MUTEX(mutex); |
7509 | int ret; | ||
6906 | 7510 | ||
6907 | mutex_lock(&mutex); | 7511 | mutex_lock(&mutex); |
6908 | old_period = sysctl_sched_rt_period; | 7512 | old_period = sysctl_sched_rt_period; |
@@ -6911,21 +7515,50 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
6911 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7515 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
6912 | 7516 | ||
6913 | if (!ret && write) { | 7517 | if (!ret && write) { |
7518 | ret = sched_rt_global_validate(); | ||
7519 | if (ret) | ||
7520 | goto undo; | ||
7521 | |||
6914 | ret = sched_rt_global_constraints(); | 7522 | ret = sched_rt_global_constraints(); |
6915 | if (ret) { | 7523 | if (ret) |
6916 | sysctl_sched_rt_period = old_period; | 7524 | goto undo; |
6917 | sysctl_sched_rt_runtime = old_runtime; | 7525 | |
6918 | } else { | 7526 | ret = sched_dl_global_constraints(); |
6919 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | 7527 | if (ret) |
6920 | def_rt_bandwidth.rt_period = | 7528 | goto undo; |
6921 | ns_to_ktime(global_rt_period()); | 7529 | |
6922 | } | 7530 | sched_rt_do_global(); |
7531 | sched_dl_do_global(); | ||
7532 | } | ||
7533 | if (0) { | ||
7534 | undo: | ||
7535 | sysctl_sched_rt_period = old_period; | ||
7536 | sysctl_sched_rt_runtime = old_runtime; | ||
6923 | } | 7537 | } |
6924 | mutex_unlock(&mutex); | 7538 | mutex_unlock(&mutex); |
6925 | 7539 | ||
6926 | return ret; | 7540 | return ret; |
6927 | } | 7541 | } |
6928 | 7542 | ||
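sched_rt_handler() above first lets proc_dointvec() update the sysctls, then validates them for both the rt and dl classes, and rolls the values back through an "if (0) { undo: ... }" block that is reachable only via goto. A tiny stand-alone demonstration of that control-flow idiom (the function and values are made up):

#include <stdio.h>

static int apply(int new_val, int *cur)
{
	int old = *cur, ret = 0;

	*cur = new_val;			/* write first, like proc_dointvec() */
	if (new_val < 0) {		/* validation failed */
		ret = -1;
		goto undo;
	}
	if (0) {
undo:					/* reachable only through the goto */
		*cur = old;
	}
	return ret;
}

int main(void)
{
	int v = 5;

	printf("apply(7): ret=%d v=%d\n", apply(7, &v), v);
	printf("apply(-1): ret=%d v=%d\n", apply(-1, &v), v);
	return 0;
}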
7543 | int sched_rr_handler(struct ctl_table *table, int write, | ||
7544 | void __user *buffer, size_t *lenp, | ||
7545 | loff_t *ppos) | ||
7546 | { | ||
7547 | int ret; | ||
7548 | static DEFINE_MUTEX(mutex); | ||
7549 | |||
7550 | mutex_lock(&mutex); | ||
7551 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
7552 | /* make sure that internally we keep jiffies */ | ||
7553 | /* also, writing zero resets timeslice to default */ | ||
7554 | if (!ret && write) { | ||
7555 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | ||
7556 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | ||
7557 | } | ||
7558 | mutex_unlock(&mutex); | ||
7559 | return ret; | ||
7560 | } | ||
7561 | |||
6929 | #ifdef CONFIG_CGROUP_SCHED | 7562 | #ifdef CONFIG_CGROUP_SCHED |
6930 | 7563 | ||
6931 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) | 7564 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
@@ -7258,15 +7891,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | |||
7258 | return ret; | 7891 | return ret; |
7259 | } | 7892 | } |
7260 | 7893 | ||
7261 | static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, | 7894 | static int cpu_stats_show(struct seq_file *sf, void *v) |
7262 | struct cgroup_map_cb *cb) | ||
7263 | { | 7895 | { |
7264 | struct task_group *tg = css_tg(css); | 7896 | struct task_group *tg = css_tg(seq_css(sf)); |
7265 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | 7897 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
7266 | 7898 | ||
7267 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7899 | seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); |
7268 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | 7900 | seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); |
7269 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | 7901 | seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); |
7270 | 7902 | ||
7271 | return 0; | 7903 | return 0; |
7272 | } | 7904 | } |
@@ -7320,7 +7952,7 @@ static struct cftype cpu_files[] = { | |||
7320 | }, | 7952 | }, |
7321 | { | 7953 | { |
7322 | .name = "stat", | 7954 | .name = "stat", |
7323 | .read_map = cpu_stats_show, | 7955 | .seq_show = cpu_stats_show, |
7324 | }, | 7956 | }, |
7325 | #endif | 7957 | #endif |
7326 | #ifdef CONFIG_RT_GROUP_SCHED | 7958 | #ifdef CONFIG_RT_GROUP_SCHED |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722ff0299..622e0818f905 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -163,10 +163,9 @@ out: | |||
163 | return err; | 163 | return err; |
164 | } | 164 | } |
165 | 165 | ||
166 | static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, | 166 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) |
167 | struct cftype *cft, struct seq_file *m) | ||
168 | { | 167 | { |
169 | struct cpuacct *ca = css_ca(css); | 168 | struct cpuacct *ca = css_ca(seq_css(m)); |
170 | u64 percpu; | 169 | u64 percpu; |
171 | int i; | 170 | int i; |
172 | 171 | ||
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { | |||
183 | [CPUACCT_STAT_SYSTEM] = "system", | 182 | [CPUACCT_STAT_SYSTEM] = "system", |
184 | }; | 183 | }; |
185 | 184 | ||
186 | static int cpuacct_stats_show(struct cgroup_subsys_state *css, | 185 | static int cpuacct_stats_show(struct seq_file *sf, void *v) |
187 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
188 | { | 186 | { |
189 | struct cpuacct *ca = css_ca(css); | 187 | struct cpuacct *ca = css_ca(seq_css(sf)); |
190 | int cpu; | 188 | int cpu; |
191 | s64 val = 0; | 189 | s64 val = 0; |
192 | 190 | ||
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, | |||
196 | val += kcpustat->cpustat[CPUTIME_NICE]; | 194 | val += kcpustat->cpustat[CPUTIME_NICE]; |
197 | } | 195 | } |
198 | val = cputime64_to_clock_t(val); | 196 | val = cputime64_to_clock_t(val); |
199 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | 197 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); |
200 | 198 | ||
201 | val = 0; | 199 | val = 0; |
202 | for_each_online_cpu(cpu) { | 200 | for_each_online_cpu(cpu) { |
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, | |||
207 | } | 205 | } |
208 | 206 | ||
209 | val = cputime64_to_clock_t(val); | 207 | val = cputime64_to_clock_t(val); |
210 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | 208 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); |
211 | 209 | ||
212 | return 0; | 210 | return 0; |
213 | } | 211 | } |
@@ -220,11 +218,11 @@ static struct cftype files[] = { | |||
220 | }, | 218 | }, |
221 | { | 219 | { |
222 | .name = "usage_percpu", | 220 | .name = "usage_percpu", |
223 | .read_seq_string = cpuacct_percpu_seq_read, | 221 | .seq_show = cpuacct_percpu_seq_show, |
224 | }, | 222 | }, |
225 | { | 223 | { |
226 | .name = "stat", | 224 | .name = "stat", |
227 | .read_map = cpuacct_stats_show, | 225 | .seq_show = cpuacct_stats_show, |
228 | }, | 226 | }, |
229 | { } /* terminate */ | 227 | { } /* terminate */ |
230 | }; | 228 | }; |
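The cpu.stat and cpuacct conversions above replace the old cftype read_map/read_seq_string callbacks with plain seq_file ->seq_show handlers that emit one "key value" pair per line via seq_printf(). The resulting file format can be mimicked with ordinary stdio; a rough sketch of what cpu_stats_show() now produces (field names are from the hunk above, the numbers are invented):

#include <stdio.h>

/* Mirrors the lines cpu_stats_show() prints into the seq_file. */
static void cpu_stats_demo(FILE *sf, int periods, int throttled,
			   unsigned long long throttled_time)
{
	fprintf(sf, "nr_periods %d\n", periods);
	fprintf(sf, "nr_throttled %d\n", throttled);
	fprintf(sf, "throttled_time %llu\n", throttled_time);
}

int main(void)
{
	cpu_stats_demo(stdout, 42, 3, 1234567ULL);
	return 0;
}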
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..5b9bb42b2d47 --- /dev/null +++ b/kernel/sched/cpudeadline.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * kernel/sched/cpudl.c | ||
3 | * | ||
4 | * Global CPU deadline management | ||
5 | * | ||
6 | * Author: Juri Lelli <j.lelli@sssup.it> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; version 2 | ||
11 | * of the License. | ||
12 | */ | ||
13 | |||
14 | #include <linux/gfp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include "cpudeadline.h" | ||
17 | |||
18 | static inline int parent(int i) | ||
19 | { | ||
20 | return (i - 1) >> 1; | ||
21 | } | ||
22 | |||
23 | static inline int left_child(int i) | ||
24 | { | ||
25 | return (i << 1) + 1; | ||
26 | } | ||
27 | |||
28 | static inline int right_child(int i) | ||
29 | { | ||
30 | return (i << 1) + 2; | ||
31 | } | ||
32 | |||
33 | static inline int dl_time_before(u64 a, u64 b) | ||
34 | { | ||
35 | return (s64)(a - b) < 0; | ||
36 | } | ||
37 | |||
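dl_time_before() compares two 64-bit clock values through a signed subtraction, the same trick time_before() uses for jiffies, so the ordering stays correct even across a (theoretical) wraparound of the clock. A short stand-alone check with made-up values:

#include <stdio.h>
#include <stdint.h>

static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = ~0ULL - 100;	/* just before the wrap */
	uint64_t wrapped   = 50;		/* just after the wrap  */

	/* A naive a < b gets this wrong; the signed difference does not. */
	printf("naive: %d, dl_time_before: %d\n",
	       near_wrap < wrapped, dl_time_before(near_wrap, wrapped));
	return 0;
}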
38 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | ||
39 | { | ||
40 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | ||
41 | |||
42 | swap(cp->elements[a], cp->elements[b]); | ||
43 | swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); | ||
44 | } | ||
45 | |||
46 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
47 | { | ||
48 | int l, r, largest; | ||
49 | |||
50 | /* adapted from lib/prio_heap.c */ | ||
51 | while (1) { | ||
52 | l = left_child(idx); | ||
53 | r = right_child(idx); | ||
54 | largest = idx; | ||
55 | |||
56 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | ||
57 | cp->elements[l].dl)) | ||
58 | largest = l; | ||
59 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | ||
60 | cp->elements[r].dl)) | ||
61 | largest = r; | ||
62 | if (largest == idx) | ||
63 | break; | ||
64 | |||
65 | /* Push idx down the heap one level and bump one up */ | ||
66 | cpudl_exchange(cp, largest, idx); | ||
67 | idx = largest; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | ||
72 | { | ||
73 | WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); | ||
74 | |||
75 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | ||
76 | cp->elements[idx].dl = new_dl; | ||
77 | cpudl_heapify(cp, idx); | ||
78 | } else { | ||
79 | cp->elements[idx].dl = new_dl; | ||
80 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
81 | cp->elements[idx].dl)) { | ||
82 | cpudl_exchange(cp, idx, parent(idx)); | ||
83 | idx = parent(idx); | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
88 | static inline int cpudl_maximum(struct cpudl *cp) | ||
89 | { | ||
90 | return cp->elements[0].cpu; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * cpudl_find - find the best (later-dl) CPU in the system | ||
95 | * @cp: the cpudl max-heap context | ||
96 | * @p: the task | ||
97 | * @later_mask: a mask to fill in with the selected CPUs (or NULL) | ||
98 | * | ||
99 | * Returns: int - best CPU (heap maximum if suitable) | ||
100 | */ | ||
101 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
102 | struct cpumask *later_mask) | ||
103 | { | ||
104 | int best_cpu = -1; | ||
105 | const struct sched_dl_entity *dl_se = &p->dl; | ||
106 | |||
107 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | ||
108 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
109 | later_mask, cpu_active_mask)) { | ||
110 | best_cpu = cpumask_any(later_mask); | ||
111 | goto out; | ||
112 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | ||
113 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | ||
114 | best_cpu = cpudl_maximum(cp); | ||
115 | if (later_mask) | ||
116 | cpumask_set_cpu(best_cpu, later_mask); | ||
117 | } | ||
118 | |||
119 | out: | ||
120 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | ||
121 | |||
122 | return best_cpu; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * cpudl_set - update the cpudl max-heap | ||
127 | * @cp: the cpudl max-heap context | ||
128 | * @cpu: the target cpu | ||
129 | * @dl: the new earliest deadline for this cpu | ||
130 | * | ||
131 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
132 | * | ||
133 | * Returns: (void) | ||
134 | */ | ||
135 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | ||
136 | { | ||
137 | int old_idx, new_cpu; | ||
138 | unsigned long flags; | ||
139 | |||
140 | WARN_ON(!cpu_present(cpu)); | ||
141 | |||
142 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
143 | old_idx = cp->cpu_to_idx[cpu]; | ||
144 | if (!is_valid) { | ||
145 | /* remove item */ | ||
146 | if (old_idx == IDX_INVALID) { | ||
147 | /* | ||
148 | * Nothing to remove if old_idx was invalid. | ||
149 | * This could happen if a rq_offline_dl is | ||
150 | * called for a CPU without -dl tasks running. | ||
151 | */ | ||
152 | goto out; | ||
153 | } | ||
154 | new_cpu = cp->elements[cp->size - 1].cpu; | ||
155 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | ||
156 | cp->elements[old_idx].cpu = new_cpu; | ||
157 | cp->size--; | ||
158 | cp->cpu_to_idx[new_cpu] = old_idx; | ||
159 | cp->cpu_to_idx[cpu] = IDX_INVALID; | ||
160 | while (old_idx > 0 && dl_time_before( | ||
161 | cp->elements[parent(old_idx)].dl, | ||
162 | cp->elements[old_idx].dl)) { | ||
163 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
164 | old_idx = parent(old_idx); | ||
165 | } | ||
166 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
167 | cpudl_heapify(cp, old_idx); | ||
168 | |||
169 | goto out; | ||
170 | } | ||
171 | |||
172 | if (old_idx == IDX_INVALID) { | ||
173 | cp->size++; | ||
174 | cp->elements[cp->size - 1].dl = 0; | ||
175 | cp->elements[cp->size - 1].cpu = cpu; | ||
176 | cp->cpu_to_idx[cpu] = cp->size - 1; | ||
177 | cpudl_change_key(cp, cp->size - 1, dl); | ||
178 | cpumask_clear_cpu(cpu, cp->free_cpus); | ||
179 | } else { | ||
180 | cpudl_change_key(cp, old_idx, dl); | ||
181 | } | ||
182 | |||
183 | out: | ||
184 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * cpudl_init - initialize the cpudl structure | ||
189 | * @cp: the cpudl max-heap context | ||
190 | */ | ||
191 | int cpudl_init(struct cpudl *cp) | ||
192 | { | ||
193 | int i; | ||
194 | |||
195 | memset(cp, 0, sizeof(*cp)); | ||
196 | raw_spin_lock_init(&cp->lock); | ||
197 | cp->size = 0; | ||
198 | for (i = 0; i < NR_CPUS; i++) | ||
199 | cp->cpu_to_idx[i] = IDX_INVALID; | ||
200 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) | ||
201 | return -ENOMEM; | ||
202 | cpumask_setall(cp->free_cpus); | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * cpudl_cleanup - clean up the cpudl structure | ||
209 | * @cp: the cpudl max-heap context | ||
210 | */ | ||
211 | void cpudl_cleanup(struct cpudl *cp) | ||
212 | { | ||
213 | /* | ||
214 | * nothing to do for the moment | ||
215 | */ | ||
216 | } | ||
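cpudl keeps one entry per CPU in an array-backed max-heap keyed by that CPU's earliest deadline, so the root returned by cpudl_maximum() is always the CPU whose current work can wait the longest, the natural target for pushing a newly woken -deadline task. Below is a reduced stand-alone model of the same structure: identical index arithmetic and sift operations, but without the cpumask, locking and hotplug details (all helper names are local to this sketch):

#include <stdio.h>
#include <stdint.h>

#define NCPU 4

struct item { uint64_t dl; int cpu; };

static struct item heap[NCPU];
static int heap_size;
static int cpu_to_idx[NCPU];

static void swap_items(int a, int b)
{
	struct item t = heap[a]; heap[a] = heap[b]; heap[b] = t;
	cpu_to_idx[heap[a].cpu] = a;
	cpu_to_idx[heap[b].cpu] = b;
}

/* Push an updated (cpu, dl) key up or down to restore the max-heap. */
static void heap_update(int cpu, uint64_t dl)
{
	int i = cpu_to_idx[cpu];

	heap[i].dl = dl;
	while (i > 0 && heap[(i - 1) / 2].dl < heap[i].dl) {
		swap_items(i, (i - 1) / 2);
		i = (i - 1) / 2;
	}
	for (;;) {
		int l = 2 * i + 1, r = 2 * i + 2, largest = i;

		if (l < heap_size && heap[l].dl > heap[largest].dl)
			largest = l;
		if (r < heap_size && heap[r].dl > heap[largest].dl)
			largest = r;
		if (largest == i)
			break;
		swap_items(i, largest);
		i = largest;
	}
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++) {
		heap[cpu].cpu = cpu;
		heap[cpu].dl = 0;
		cpu_to_idx[cpu] = cpu;
	}
	heap_size = NCPU;

	heap_update(0, 300);	/* earliest deadline per CPU, in "ns" */
	heap_update(1, 900);
	heap_update(2, 500);
	heap_update(3, 100);

	/* Root of the max-heap: best target for pushing a -dl task. */
	printf("latest-deadline CPU: %d (dl=%llu)\n",
	       heap[0].cpu, (unsigned long long)heap[0].dl);
	return 0;
}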
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef _LINUX_CPUDL_H | ||
2 | #define _LINUX_CPUDL_H | ||
3 | |||
4 | #include <linux/sched.h> | ||
5 | |||
6 | #define IDX_INVALID -1 | ||
7 | |||
8 | struct array_item { | ||
9 | u64 dl; | ||
10 | int cpu; | ||
11 | }; | ||
12 | |||
13 | struct cpudl { | ||
14 | raw_spinlock_t lock; | ||
15 | int size; | ||
16 | int cpu_to_idx[NR_CPUS]; | ||
17 | struct array_item elements[NR_CPUS]; | ||
18 | cpumask_var_t free_cpus; | ||
19 | }; | ||
20 | |||
21 | |||
22 | #ifdef CONFIG_SMP | ||
23 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
24 | struct cpumask *later_mask); | ||
25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | ||
26 | int cpudl_init(struct cpudl *cp); | ||
27 | void cpudl_cleanup(struct cpudl *cp); | ||
28 | #else | ||
29 | #define cpudl_set(cp, cpu, dl, is_valid) do { } while (0) | ||
30 | #define cpudl_init(cp) (0) | ||
31 | #endif /* CONFIG_SMP */ | ||
32 | |||
33 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..6e79b3faa4cd --- /dev/null +++ b/kernel/sched/deadline.c | |||
@@ -0,0 +1,1639 @@ | |||
1 | /* | ||
2 | * Deadline Scheduling Class (SCHED_DEADLINE) | ||
3 | * | ||
4 | * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). | ||
5 | * | ||
6 | * Tasks that periodically execute their instances for less than their | ||
7 | * runtime won't miss any of their deadlines. | ||
8 | * Tasks that are not periodic or sporadic or that try to execute more | ||
9 | * than their reserved bandwidth will be slowed down (and may potentially | ||
10 | * miss some of their deadlines), and won't affect any other task. | ||
11 | * | ||
12 | * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, | ||
13 | * Juri Lelli <juri.lelli@gmail.com>, | ||
14 | * Michael Trimarchi <michael@amarulasolutions.com>, | ||
15 | * Fabio Checconi <fchecconi@gmail.com> | ||
16 | */ | ||
17 | #include "sched.h" | ||
18 | |||
19 | #include <linux/slab.h> | ||
20 | |||
21 | struct dl_bandwidth def_dl_bandwidth; | ||
22 | |||
23 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | ||
24 | { | ||
25 | return container_of(dl_se, struct task_struct, dl); | ||
26 | } | ||
27 | |||
28 | static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) | ||
29 | { | ||
30 | return container_of(dl_rq, struct rq, dl); | ||
31 | } | ||
32 | |||
33 | static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) | ||
34 | { | ||
35 | struct task_struct *p = dl_task_of(dl_se); | ||
36 | struct rq *rq = task_rq(p); | ||
37 | |||
38 | return &rq->dl; | ||
39 | } | ||
40 | |||
41 | static inline int on_dl_rq(struct sched_dl_entity *dl_se) | ||
42 | { | ||
43 | return !RB_EMPTY_NODE(&dl_se->rb_node); | ||
44 | } | ||
45 | |||
46 | static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) | ||
47 | { | ||
48 | struct sched_dl_entity *dl_se = &p->dl; | ||
49 | |||
50 | return dl_rq->rb_leftmost == &dl_se->rb_node; | ||
51 | } | ||
52 | |||
53 | void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) | ||
54 | { | ||
55 | raw_spin_lock_init(&dl_b->dl_runtime_lock); | ||
56 | dl_b->dl_period = period; | ||
57 | dl_b->dl_runtime = runtime; | ||
58 | } | ||
59 | |||
60 | extern unsigned long to_ratio(u64 period, u64 runtime); | ||
61 | |||
62 | void init_dl_bw(struct dl_bw *dl_b) | ||
63 | { | ||
64 | raw_spin_lock_init(&dl_b->lock); | ||
65 | raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); | ||
66 | if (global_rt_runtime() == RUNTIME_INF) | ||
67 | dl_b->bw = -1; | ||
68 | else | ||
69 | dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
70 | raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); | ||
71 | dl_b->total_bw = 0; | ||
72 | } | ||
73 | |||
74 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | ||
75 | { | ||
76 | dl_rq->rb_root = RB_ROOT; | ||
77 | |||
78 | #ifdef CONFIG_SMP | ||
79 | /* zero means no -deadline tasks */ | ||
80 | dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; | ||
81 | |||
82 | dl_rq->dl_nr_migratory = 0; | ||
83 | dl_rq->overloaded = 0; | ||
84 | dl_rq->pushable_dl_tasks_root = RB_ROOT; | ||
85 | #else | ||
86 | init_dl_bw(&dl_rq->dl_bw); | ||
87 | #endif | ||
88 | } | ||
89 | |||
90 | #ifdef CONFIG_SMP | ||
91 | |||
92 | static inline int dl_overloaded(struct rq *rq) | ||
93 | { | ||
94 | return atomic_read(&rq->rd->dlo_count); | ||
95 | } | ||
96 | |||
97 | static inline void dl_set_overload(struct rq *rq) | ||
98 | { | ||
99 | if (!rq->online) | ||
100 | return; | ||
101 | |||
102 | cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); | ||
103 | /* | ||
104 | * Must be visible before the overload count is | ||
105 | * set (as in sched_rt.c). | ||
106 | * | ||
107 | * Matched by the barrier in pull_dl_task(). | ||
108 | */ | ||
109 | smp_wmb(); | ||
110 | atomic_inc(&rq->rd->dlo_count); | ||
111 | } | ||
112 | |||
113 | static inline void dl_clear_overload(struct rq *rq) | ||
114 | { | ||
115 | if (!rq->online) | ||
116 | return; | ||
117 | |||
118 | atomic_dec(&rq->rd->dlo_count); | ||
119 | cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); | ||
120 | } | ||
121 | |||
122 | static void update_dl_migration(struct dl_rq *dl_rq) | ||
123 | { | ||
124 | if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { | ||
125 | if (!dl_rq->overloaded) { | ||
126 | dl_set_overload(rq_of_dl_rq(dl_rq)); | ||
127 | dl_rq->overloaded = 1; | ||
128 | } | ||
129 | } else if (dl_rq->overloaded) { | ||
130 | dl_clear_overload(rq_of_dl_rq(dl_rq)); | ||
131 | dl_rq->overloaded = 0; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
136 | { | ||
137 | struct task_struct *p = dl_task_of(dl_se); | ||
138 | |||
139 | if (p->nr_cpus_allowed > 1) | ||
140 | dl_rq->dl_nr_migratory++; | ||
141 | |||
142 | update_dl_migration(dl_rq); | ||
143 | } | ||
144 | |||
145 | static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
146 | { | ||
147 | struct task_struct *p = dl_task_of(dl_se); | ||
148 | |||
149 | if (p->nr_cpus_allowed > 1) | ||
150 | dl_rq->dl_nr_migratory--; | ||
151 | |||
152 | update_dl_migration(dl_rq); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * The list of pushable -deadline task is not a plist, like in | ||
157 | * sched_rt.c, it is an rb-tree with tasks ordered by deadline. | ||
158 | */ | ||
159 | static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
160 | { | ||
161 | struct dl_rq *dl_rq = &rq->dl; | ||
162 | struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; | ||
163 | struct rb_node *parent = NULL; | ||
164 | struct task_struct *entry; | ||
165 | int leftmost = 1; | ||
166 | |||
167 | BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); | ||
168 | |||
169 | while (*link) { | ||
170 | parent = *link; | ||
171 | entry = rb_entry(parent, struct task_struct, | ||
172 | pushable_dl_tasks); | ||
173 | if (dl_entity_preempt(&p->dl, &entry->dl)) | ||
174 | link = &parent->rb_left; | ||
175 | else { | ||
176 | link = &parent->rb_right; | ||
177 | leftmost = 0; | ||
178 | } | ||
179 | } | ||
180 | |||
181 | if (leftmost) | ||
182 | dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; | ||
183 | |||
184 | rb_link_node(&p->pushable_dl_tasks, parent, link); | ||
185 | rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
186 | } | ||
187 | |||
188 | static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
189 | { | ||
190 | struct dl_rq *dl_rq = &rq->dl; | ||
191 | |||
192 | if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) | ||
193 | return; | ||
194 | |||
195 | if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { | ||
196 | struct rb_node *next_node; | ||
197 | |||
198 | next_node = rb_next(&p->pushable_dl_tasks); | ||
199 | dl_rq->pushable_dl_tasks_leftmost = next_node; | ||
200 | } | ||
201 | |||
202 | rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
203 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
204 | } | ||
205 | |||
206 | static inline int has_pushable_dl_tasks(struct rq *rq) | ||
207 | { | ||
208 | return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); | ||
209 | } | ||
210 | |||
211 | static int push_dl_task(struct rq *rq); | ||
212 | |||
213 | #else | ||
214 | |||
215 | static inline | ||
216 | void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
217 | { | ||
218 | } | ||
219 | |||
220 | static inline | ||
221 | void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
222 | { | ||
223 | } | ||
224 | |||
225 | static inline | ||
226 | void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
227 | { | ||
228 | } | ||
229 | |||
230 | static inline | ||
231 | void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
232 | { | ||
233 | } | ||
234 | |||
235 | #endif /* CONFIG_SMP */ | ||
236 | |||
237 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
238 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
239 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
240 | int flags); | ||
241 | |||
242 | /* | ||
243 | * We are being explicitly informed that a new instance is starting, | ||
244 | * and this means that: | ||
245 | * - the absolute deadline of the entity has to be placed at | ||
246 | * current time + relative deadline; | ||
247 | * - the runtime of the entity has to be set to the maximum value. | ||
248 | * | ||
249 | * The capability of specifying such an event is useful whenever a -deadline | ||
250 | * entity wants to (try to!) synchronize its behaviour with the scheduler's | ||
251 | * one, and to (try to!) reconcile itself with its own scheduling | ||
252 | * parameters. | ||
253 | */ | ||
254 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | ||
255 | struct sched_dl_entity *pi_se) | ||
256 | { | ||
257 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
258 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
259 | |||
260 | WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); | ||
261 | |||
262 | /* | ||
263 | * We use the regular wall clock time to set deadlines in the | ||
264 | * future; in fact, we must consider execution overheads (time | ||
265 | * spent on hardirq context, etc.). | ||
266 | */ | ||
267 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
268 | dl_se->runtime = pi_se->dl_runtime; | ||
269 | dl_se->dl_new = 0; | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * Pure Earliest Deadline First (EDF) scheduling does not deal with the | ||
274 | * possibility of an entity lasting more than what it declared, and thus | ||
275 | * exhausting its runtime. | ||
276 | * | ||
277 | * Here we are interested in making runtime overrun possible, but we do | ||
278 | * not want an entity which is misbehaving to affect the scheduling of all | ||
279 | * other entities. | ||
280 | * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) | ||
281 | * is used, in order to confine each entity within its own bandwidth. | ||
282 | * | ||
283 | * This function deals exactly with that, and ensures that when the runtime | ||
284 | * of an entity is replenished, its deadline is also postponed. That ensures | ||
285 | * the overrunning entity can't interfere with other entity in the system and | ||
286 | * can't make them miss their deadlines. Reasons why this kind of overruns | ||
287 | * could happen are, typically, an entity voluntarily trying to overrun its | ||
288 | * runtime, or it just underestimated it during sched_setscheduler_ex(). | ||
289 | */ | ||
290 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, | ||
291 | struct sched_dl_entity *pi_se) | ||
292 | { | ||
293 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
294 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
295 | |||
296 | BUG_ON(pi_se->dl_runtime <= 0); | ||
297 | |||
298 | /* | ||
299 | * This could be the case for a !-dl task that is boosted. | ||
300 | * Just go with full inherited parameters. | ||
301 | */ | ||
302 | if (dl_se->dl_deadline == 0) { | ||
303 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
304 | dl_se->runtime = pi_se->dl_runtime; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * We keep moving the deadline away until we get some | ||
309 | * available runtime for the entity. This ensures correct | ||
310 | * handling of situations where the runtime overrun is | ||
311 | * arbitrarily large. | ||
312 | */ | ||
313 | while (dl_se->runtime <= 0) { | ||
314 | dl_se->deadline += pi_se->dl_period; | ||
315 | dl_se->runtime += pi_se->dl_runtime; | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * At this point, the deadline really should be "in | ||
320 | * the future" with respect to rq->clock. If it's | ||
321 | * not, we are, for some reason, lagging too much! | ||
322 | * Anyway, after having warned userspace about that, | ||
323 | * we still try to keep things running by | ||
324 | * resetting the deadline and the budget of the | ||
325 | * entity. | ||
326 | */ | ||
327 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | ||
328 | static bool lag_once = false; | ||
329 | |||
330 | if (!lag_once) { | ||
331 | lag_once = true; | ||
332 | printk_sched("sched: DL replenish lagged too much\n"); | ||
333 | } | ||
334 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
335 | dl_se->runtime = pi_se->dl_runtime; | ||
336 | } | ||
337 | } | ||
338 | |||
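The replenishment loop above moves the deadline forward one period at a time until the accumulated runtime becomes positive again, which keeps an arbitrarily large overrun from turning into more than a proportional postponement. A small numeric walk-through of that loop (reservation and overrun values are invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Reservation: 10ms every 30ms (values in us for readability). */
	const int64_t dl_runtime = 10000, dl_period = 30000;

	/* The task overran by 25ms: its runtime has gone 25ms negative. */
	int64_t runtime = -25000;
	uint64_t deadline = 100000;	/* current absolute deadline */

	while (runtime <= 0) {
		deadline += dl_period;
		runtime += dl_runtime;
	}
	printf("deadline pushed to %llu, runtime now %lld\n",
	       (unsigned long long)deadline, (long long)runtime);
	return 0;
}

Three periods of postponement repay the 25ms debt at 10ms per period and leave 5ms of budget for the new instance.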
339 | /* | ||
340 | * Here we check if --at time t-- an entity (which is probably being | ||
341 | * [re]activated or, in general, enqueued) can use its remaining runtime | ||
342 | * and its current deadline _without_ exceeding the bandwidth it is | ||
343 | * assigned (function returns true if it can't). We are in fact applying | ||
344 | * one of the CBS rules: when a task wakes up, if the residual runtime | ||
345 | * over residual deadline fits within the allocated bandwidth, then we | ||
346 | * can keep the current (absolute) deadline and residual budget without | ||
347 | * disrupting the schedulability of the system. Otherwise, we should | ||
348 | * refill the runtime and set the deadline a period in the future, | ||
349 | * because keeping the current (absolute) deadline of the task would | ||
350 | * result in breaking guarantees promised to other tasks (refer to | ||
351 | * Documentation/scheduler/sched-deadline.txt for more information). | ||
352 | * | ||
353 | * This function returns true if: | ||
354 | * | ||
355 | * runtime / (deadline - t) > dl_runtime / dl_period , | ||
356 | * | ||
357 | * IOW we can't recycle current parameters. | ||
358 | * | ||
359 | * Notice that the bandwidth check is done against the period. For | ||
360 | * tasks with deadline equal to period this is the same as using | ||
361 | * dl_deadline instead of dl_period in the equation above. | ||
362 | */ | ||
363 | static bool dl_entity_overflow(struct sched_dl_entity *dl_se, | ||
364 | struct sched_dl_entity *pi_se, u64 t) | ||
365 | { | ||
366 | u64 left, right; | ||
367 | |||
368 | /* | ||
369 | * left and right are the two sides of the equation above, | ||
370 | * after a bit of shuffling to use multiplications instead | ||
371 | * of divisions. | ||
372 | * | ||
373 | * Note that none of the time values involved in the two | ||
374 | * multiplications are absolute: dl_deadline and dl_runtime | ||
375 | * are the relative deadline and the maximum runtime of each | ||
376 | * instance, runtime is the runtime left for the last instance | ||
377 | * and (deadline - t), since t is rq->clock, is the time left | ||
378 | * to the (absolute) deadline. Even if overflowing the u64 type | ||
379 | * is very unlikely to occur in both cases, here we scale down | ||
380 | * as we want to avoid that risk at all. Scaling down by 10 | ||
381 | * means that we reduce granularity to 1us. We are fine with it, | ||
382 | * since this is only a true/false check and, anyway, thinking | ||
383 | * of anything below microsecond resolution is actually fiction | ||
384 | * (but still we want to give the user that illusion >;). | ||
385 | */ | ||
386 | left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); | ||
387 | right = ((dl_se->deadline - t) >> DL_SCALE) * | ||
388 | (pi_se->dl_runtime >> DL_SCALE); | ||
389 | |||
390 | return dl_time_before(right, left); | ||
391 | } | ||
392 | |||
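dl_entity_overflow() turns the division-based test runtime / (deadline - t) > dl_runtime / dl_period into a cross-multiplication, shifting every operand right by DL_SCALE first so the two products fit comfortably in u64 even for nanosecond-range values; with DL_SCALE = 10, the value used by this series, that costs roughly microsecond granularity. A stand-alone version of the same check with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

#define DL_SCALE 10

/* true means: the current (runtime, deadline) pair can NOT be reused at t */
static int dl_entity_overflow_demo(uint64_t runtime, uint64_t deadline,
				   uint64_t dl_runtime, uint64_t dl_period,
				   uint64_t t)
{
	uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
	uint64_t right = ((deadline - t) >> DL_SCALE) *
			 (dl_runtime >> DL_SCALE);

	return right < left;
}

int main(void)
{
	/* 10ms/30ms reservation; 6ms of runtime left, 10ms to the deadline */
	uint64_t dl_runtime = 10000000, dl_period = 30000000;
	uint64_t runtime = 6000000, deadline = 110000000, t = 100000000;

	printf("overflow? %d\n",
	       dl_entity_overflow_demo(runtime, deadline,
				       dl_runtime, dl_period, t));
	return 0;
}

Here 6ms of leftover runtime over the 10ms left to the deadline is a 60% demand against a 1/3 reservation, so the check reports an overflow and the caller refreshes deadline and runtime instead of reusing them.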
393 | /* | ||
394 | * When a -deadline entity is queued back on the runqueue, its runtime and | ||
395 | * deadline might need updating. | ||
396 | * | ||
397 | * The policy here is that we update the deadline of the entity only if: | ||
398 | * - the current deadline is in the past, | ||
399 | * - using the remaining runtime with the current deadline would make | ||
400 | * the entity exceed its bandwidth. | ||
401 | */ | ||
402 | static void update_dl_entity(struct sched_dl_entity *dl_se, | ||
403 | struct sched_dl_entity *pi_se) | ||
404 | { | ||
405 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
406 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
407 | |||
408 | /* | ||
409 | * The arrival of a new instance needs special treatment, i.e., | ||
410 | * the actual scheduling parameters have to be "renewed". | ||
411 | */ | ||
412 | if (dl_se->dl_new) { | ||
413 | setup_new_dl_entity(dl_se, pi_se); | ||
414 | return; | ||
415 | } | ||
416 | |||
417 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | ||
418 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | ||
419 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
420 | dl_se->runtime = pi_se->dl_runtime; | ||
421 | } | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * If the entity depleted all its runtime, and if we want it to sleep | ||
426 | * while waiting for some new execution time to become available, we | ||
427 | * set the bandwidth enforcement timer to the replenishment instant | ||
428 | * and try to activate it. | ||
429 | * | ||
430 | * Notice that it is important for the caller to know if the timer | ||
431 | * actually started or not (i.e., the replenishment instant is in | ||
432 | * the future or in the past). | ||
433 | */ | ||
434 | static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) | ||
435 | { | ||
436 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
437 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
438 | ktime_t now, act; | ||
439 | ktime_t soft, hard; | ||
440 | unsigned long range; | ||
441 | s64 delta; | ||
442 | |||
443 | if (boosted) | ||
444 | return 0; | ||
445 | /* | ||
446 | * We want the timer to fire at the deadline, but considering | ||
447 | * that it is actually coming from rq->clock and not from | ||
448 | * hrtimer's time base reading. | ||
449 | */ | ||
450 | act = ns_to_ktime(dl_se->deadline); | ||
451 | now = hrtimer_cb_get_time(&dl_se->dl_timer); | ||
452 | delta = ktime_to_ns(now) - rq_clock(rq); | ||
453 | act = ktime_add_ns(act, delta); | ||
454 | |||
455 | /* | ||
456 | * If the expiry time already passed, e.g., because the value | ||
457 | * chosen as the deadline is too small, don't even try to | ||
458 | * start the timer in the past! | ||
459 | */ | ||
460 | if (ktime_us_delta(act, now) < 0) | ||
461 | return 0; | ||
462 | |||
463 | hrtimer_set_expires(&dl_se->dl_timer, act); | ||
464 | |||
465 | soft = hrtimer_get_softexpires(&dl_se->dl_timer); | ||
466 | hard = hrtimer_get_expires(&dl_se->dl_timer); | ||
467 | range = ktime_to_ns(ktime_sub(hard, soft)); | ||
468 | __hrtimer_start_range_ns(&dl_se->dl_timer, soft, | ||
469 | range, HRTIMER_MODE_ABS, 0); | ||
470 | |||
471 | return hrtimer_active(&dl_se->dl_timer); | ||
472 | } | ||
473 | |||
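start_dl_timer() has to arm an hrtimer for an instant that was computed on rq_clock, so it shifts the target by the current offset between the hrtimer's time base and rq_clock before programming it. The correction is purely additive; a sketch with made-up clock readings:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t deadline_rqclock = 5000000000ULL;	/* ns, on rq_clock */
	uint64_t now_rqclock      = 4990000000ULL;
	uint64_t now_hrtimer      = 7300000000ULL;	/* ns, hrtimer base */

	/* act = deadline + (hrtimer_now - rq_clock_now) */
	int64_t delta = (int64_t)(now_hrtimer - now_rqclock);
	uint64_t act = deadline_rqclock + delta;

	printf("arm hrtimer at %llu (%lld ns from now)\n",
	       (unsigned long long)act,
	       (long long)(act - now_hrtimer));
	return 0;
}

The deadline is 10ms away on rq_clock, so the timer ends up 10ms away on the hrtimer base as well.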
474 | /* | ||
475 | * This is the bandwidth enforcement timer callback. If here, we know | ||
476 | * a task is not on its dl_rq, since the fact that the timer was running | ||
477 | * means the task is throttled and needs a runtime replenishment. | ||
478 | * | ||
479 | * However, what we actually do depends on whether the task is still active | ||
480 | * (i.e. it is on its rq) or has been removed from there by a call to | ||
481 | * dequeue_task_dl(). In the former case we must issue the runtime | ||
482 | * replenishment and add the task back to the dl_rq; in the latter, we just | ||
483 | * do nothing but clearing dl_throttled, so that runtime and deadline | ||
484 | * updating (and the queueing back to dl_rq) will be done by the | ||
485 | * next call to enqueue_task_dl(). | ||
486 | */ | ||
487 | static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | ||
488 | { | ||
489 | struct sched_dl_entity *dl_se = container_of(timer, | ||
490 | struct sched_dl_entity, | ||
491 | dl_timer); | ||
492 | struct task_struct *p = dl_task_of(dl_se); | ||
493 | struct rq *rq = task_rq(p); | ||
494 | raw_spin_lock(&rq->lock); | ||
495 | |||
496 | /* | ||
497 | * We need to take care of possible races here. In fact, the | ||
498 | * task might have changed its scheduling policy to something | ||
499 | * different from SCHED_DEADLINE or changed its reservation | ||
500 | * parameters (through sched_setscheduler()). | ||
501 | */ | ||
502 | if (!dl_task(p) || dl_se->dl_new) | ||
503 | goto unlock; | ||
504 | |||
505 | sched_clock_tick(); | ||
506 | update_rq_clock(rq); | ||
507 | dl_se->dl_throttled = 0; | ||
508 | if (p->on_rq) { | ||
509 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
510 | if (task_has_dl_policy(rq->curr)) | ||
511 | check_preempt_curr_dl(rq, p, 0); | ||
512 | else | ||
513 | resched_task(rq->curr); | ||
514 | #ifdef CONFIG_SMP | ||
515 | /* | ||
516 | * Queueing this task back might have overloaded rq, | ||
517 | * check if we need to kick someone away. | ||
518 | */ | ||
519 | if (has_pushable_dl_tasks(rq)) | ||
520 | push_dl_task(rq); | ||
521 | #endif | ||
522 | } | ||
523 | unlock: | ||
524 | raw_spin_unlock(&rq->lock); | ||
525 | |||
526 | return HRTIMER_NORESTART; | ||
527 | } | ||
528 | |||
529 | void init_dl_task_timer(struct sched_dl_entity *dl_se) | ||
530 | { | ||
531 | struct hrtimer *timer = &dl_se->dl_timer; | ||
532 | |||
533 | if (hrtimer_active(timer)) { | ||
534 | hrtimer_try_to_cancel(timer); | ||
535 | return; | ||
536 | } | ||
537 | |||
538 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
539 | timer->function = dl_task_timer; | ||
540 | } | ||
541 | |||
542 | static | ||
543 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | ||
544 | { | ||
545 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | ||
546 | int rorun = dl_se->runtime <= 0; | ||
547 | |||
548 | if (!rorun && !dmiss) | ||
549 | return 0; | ||
550 | |||
551 | /* | ||
552 | * If we are beyond our current deadline and we are still | ||
553 | * executing, then we have already used some of the runtime of | ||
554 | * the next instance. Thus, if we do not account for that, we are | ||
555 | * stealing bandwidth from the system at each deadline miss! | ||
556 | */ | ||
557 | if (dmiss) { | ||
558 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
559 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
560 | } | ||
561 | |||
562 | return 1; | ||
563 | } | ||
564 | |||
565 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); | ||
566 | |||
567 | /* | ||
568 | * Update the current task's runtime statistics (provided it is still | ||
569 | * a -deadline task and has not been removed from the dl_rq). | ||
570 | */ | ||
571 | static void update_curr_dl(struct rq *rq) | ||
572 | { | ||
573 | struct task_struct *curr = rq->curr; | ||
574 | struct sched_dl_entity *dl_se = &curr->dl; | ||
575 | u64 delta_exec; | ||
576 | |||
577 | if (!dl_task(curr) || !on_dl_rq(dl_se)) | ||
578 | return; | ||
579 | |||
580 | /* | ||
581 | * Consumed budget is computed considering the time as | ||
582 | * observed by schedulable tasks (excluding time spent | ||
583 | * in hardirq context, etc.). Deadlines are instead | ||
584 | * computed using hard walltime. This seems to be the more | ||
585 | * natural solution, but the full ramifications of this | ||
586 | * approach need further study. | ||
587 | */ | ||
588 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; | ||
589 | if (unlikely((s64)delta_exec < 0)) | ||
590 | delta_exec = 0; | ||
591 | |||
592 | schedstat_set(curr->se.statistics.exec_max, | ||
593 | max(curr->se.statistics.exec_max, delta_exec)); | ||
594 | |||
595 | curr->se.sum_exec_runtime += delta_exec; | ||
596 | account_group_exec_runtime(curr, delta_exec); | ||
597 | |||
598 | curr->se.exec_start = rq_clock_task(rq); | ||
599 | cpuacct_charge(curr, delta_exec); | ||
600 | |||
601 | sched_rt_avg_update(rq, delta_exec); | ||
602 | |||
603 | dl_se->runtime -= delta_exec; | ||
604 | if (dl_runtime_exceeded(rq, dl_se)) { | ||
605 | __dequeue_task_dl(rq, curr, 0); | ||
606 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | ||
607 | dl_se->dl_throttled = 1; | ||
608 | else | ||
609 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | ||
610 | |||
611 | if (!is_leftmost(curr, &rq->dl)) | ||
612 | resched_task(curr); | ||
613 | } | ||
614 | |||
615 | /* | ||
616 | * Because -- for now -- we share the rt bandwidth, we need to | ||
617 | * account our runtime there too, otherwise actual rt tasks | ||
618 | * would be able to exceed the shared quota. | ||
619 | * | ||
620 | * Account to the root rt group for now. | ||
621 | * | ||
622 | * The solution we're working towards is having the RT groups scheduled | ||
623 | * using deadline servers -- however there's a few nasties to figure | ||
624 | * out before that can happen. | ||
625 | */ | ||
626 | if (rt_bandwidth_enabled()) { | ||
627 | struct rt_rq *rt_rq = &rq->rt; | ||
628 | |||
629 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
630 | /* | ||
631 | * We'll let actual RT tasks worry about the overflow here, we | ||
632 | * have our own CBS to keep us in line; only account when RT | ||
633 | * bandwidth is relevant. | ||
634 | */ | ||
635 | if (sched_rt_bandwidth_account(rt_rq)) | ||
636 | rt_rq->rt_time += delta_exec; | ||
637 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
638 | } | ||
639 | } | ||
640 | |||
641 | #ifdef CONFIG_SMP | ||
642 | |||
643 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); | ||
644 | |||
645 | static inline u64 next_deadline(struct rq *rq) | ||
646 | { | ||
647 | struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); | ||
648 | |||
649 | if (next && dl_prio(next->prio)) | ||
650 | return next->dl.deadline; | ||
651 | else | ||
652 | return 0; | ||
653 | } | ||
654 | |||
655 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
656 | { | ||
657 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
658 | |||
659 | if (dl_rq->earliest_dl.curr == 0 || | ||
660 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | ||
661 | /* | ||
662 | * If the dl_rq had no -deadline tasks, or if the new task | ||
663 | * has a shorter deadline than the current one on dl_rq, we | ||
664 | * know that the previous earliest becomes our next earliest, | ||
665 | * as the new task becomes the earliest itself. | ||
666 | */ | ||
667 | dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; | ||
668 | dl_rq->earliest_dl.curr = deadline; | ||
669 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | ||
670 | } else if (dl_rq->earliest_dl.next == 0 || | ||
671 | dl_time_before(deadline, dl_rq->earliest_dl.next)) { | ||
672 | /* | ||
673 | * On the other hand, if the new -deadline task has a | ||
674 | * later deadline than the earliest one on dl_rq, but | ||
675 | * it is earlier than the next (if any), we must | ||
676 | * recompute the next-earliest. | ||
677 | */ | ||
678 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
679 | } | ||
680 | } | ||
681 | |||
682 | static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
683 | { | ||
684 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
685 | |||
686 | /* | ||
687 | * Since we may have removed our earliest (and/or next earliest) | ||
688 | * task we must recompute them. | ||
689 | */ | ||
690 | if (!dl_rq->dl_nr_running) { | ||
691 | dl_rq->earliest_dl.curr = 0; | ||
692 | dl_rq->earliest_dl.next = 0; | ||
693 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
694 | } else { | ||
695 | struct rb_node *leftmost = dl_rq->rb_leftmost; | ||
696 | struct sched_dl_entity *entry; | ||
697 | |||
698 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | ||
699 | dl_rq->earliest_dl.curr = entry->deadline; | ||
700 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
701 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | ||
702 | } | ||
703 | } | ||
704 | |||
705 | #else | ||
706 | |||
707 | static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
708 | static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
709 | |||
710 | #endif /* CONFIG_SMP */ | ||
711 | |||
712 | static inline | ||
713 | void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
714 | { | ||
715 | int prio = dl_task_of(dl_se)->prio; | ||
716 | u64 deadline = dl_se->deadline; | ||
717 | |||
718 | WARN_ON(!dl_prio(prio)); | ||
719 | dl_rq->dl_nr_running++; | ||
720 | inc_nr_running(rq_of_dl_rq(dl_rq)); | ||
721 | |||
722 | inc_dl_deadline(dl_rq, deadline); | ||
723 | inc_dl_migration(dl_se, dl_rq); | ||
724 | } | ||
725 | |||
726 | static inline | ||
727 | void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
728 | { | ||
729 | int prio = dl_task_of(dl_se)->prio; | ||
730 | |||
731 | WARN_ON(!dl_prio(prio)); | ||
732 | WARN_ON(!dl_rq->dl_nr_running); | ||
733 | dl_rq->dl_nr_running--; | ||
734 | dec_nr_running(rq_of_dl_rq(dl_rq)); | ||
735 | |||
736 | dec_dl_deadline(dl_rq, dl_se->deadline); | ||
737 | dec_dl_migration(dl_se, dl_rq); | ||
738 | } | ||
739 | |||
740 | static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) | ||
741 | { | ||
742 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
743 | struct rb_node **link = &dl_rq->rb_root.rb_node; | ||
744 | struct rb_node *parent = NULL; | ||
745 | struct sched_dl_entity *entry; | ||
746 | int leftmost = 1; | ||
747 | |||
748 | BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); | ||
749 | |||
750 | while (*link) { | ||
751 | parent = *link; | ||
752 | entry = rb_entry(parent, struct sched_dl_entity, rb_node); | ||
753 | if (dl_time_before(dl_se->deadline, entry->deadline)) | ||
754 | link = &parent->rb_left; | ||
755 | else { | ||
756 | link = &parent->rb_right; | ||
757 | leftmost = 0; | ||
758 | } | ||
759 | } | ||
760 | |||
761 | if (leftmost) | ||
762 | dl_rq->rb_leftmost = &dl_se->rb_node; | ||
763 | |||
764 | rb_link_node(&dl_se->rb_node, parent, link); | ||
765 | rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); | ||
766 | |||
767 | inc_dl_tasks(dl_se, dl_rq); | ||
768 | } | ||
769 | |||
770 | static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
771 | { | ||
772 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
773 | |||
774 | if (RB_EMPTY_NODE(&dl_se->rb_node)) | ||
775 | return; | ||
776 | |||
777 | if (dl_rq->rb_leftmost == &dl_se->rb_node) { | ||
778 | struct rb_node *next_node; | ||
779 | |||
780 | next_node = rb_next(&dl_se->rb_node); | ||
781 | dl_rq->rb_leftmost = next_node; | ||
782 | } | ||
783 | |||
784 | rb_erase(&dl_se->rb_node, &dl_rq->rb_root); | ||
785 | RB_CLEAR_NODE(&dl_se->rb_node); | ||
786 | |||
787 | dec_dl_tasks(dl_se, dl_rq); | ||
788 | } | ||
789 | |||
790 | static void | ||
791 | enqueue_dl_entity(struct sched_dl_entity *dl_se, | ||
792 | struct sched_dl_entity *pi_se, int flags) | ||
793 | { | ||
794 | BUG_ON(on_dl_rq(dl_se)); | ||
795 | |||
796 | /* | ||
797 | * If this is a wakeup or a new instance, the scheduling | ||
798 | * parameters of the task might need updating. Otherwise, | ||
799 | * we want a replenishment of its runtime. | ||
800 | */ | ||
801 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | ||
802 | replenish_dl_entity(dl_se, pi_se); | ||
803 | else | ||
804 | update_dl_entity(dl_se, pi_se); | ||
805 | |||
806 | __enqueue_dl_entity(dl_se); | ||
807 | } | ||
808 | |||
809 | static void dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
810 | { | ||
811 | __dequeue_dl_entity(dl_se); | ||
812 | } | ||
813 | |||
814 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
815 | { | ||
816 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | ||
817 | struct sched_dl_entity *pi_se = &p->dl; | ||
818 | |||
819 | /* | ||
820 | * Use the scheduling parameters of the top pi-waiter | ||
821 | * task if we have one and its (relative) deadline is | ||
822 | * smaller than ours; otherwise we keep our own runtime | ||
823 | * and deadline. | ||
824 | */ | ||
825 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | ||
826 | pi_se = &pi_task->dl; | ||
827 | |||
828 | /* | ||
829 | * If p is throttled, we do nothing. In fact, if it exhausted | ||
830 | * its budget it needs a replenishment and, since it now is on | ||
831 | * its rq, the bandwidth timer callback (which clearly has not | ||
832 | * run yet) will take care of this. | ||
833 | */ | ||
834 | if (p->dl.dl_throttled) | ||
835 | return; | ||
836 | |||
837 | enqueue_dl_entity(&p->dl, pi_se, flags); | ||
838 | |||
839 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | ||
840 | enqueue_pushable_dl_task(rq, p); | ||
841 | } | ||
842 | |||
843 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
844 | { | ||
845 | dequeue_dl_entity(&p->dl); | ||
846 | dequeue_pushable_dl_task(rq, p); | ||
847 | } | ||
848 | |||
849 | static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
850 | { | ||
851 | update_curr_dl(rq); | ||
852 | __dequeue_task_dl(rq, p, flags); | ||
853 | } | ||
854 | |||
855 | /* | ||
856 | * Yield task semantic for -deadline tasks is: | ||
857 | * | ||
858 | * get off the CPU until our next instance arrives, with | ||
859 | * a new runtime. This is of little use now, since we | ||
860 | * don't have a bandwidth reclaiming mechanism yet. Once | ||
861 | * such a mechanism exists, yield_task_dl will indicate | ||
862 | * that some spare budget is available for other task | ||
863 | * instances to use. | ||
864 | */ | ||
865 | static void yield_task_dl(struct rq *rq) | ||
866 | { | ||
867 | struct task_struct *p = rq->curr; | ||
868 | |||
869 | /* | ||
870 | * We make the task go to sleep until its current deadline by | ||
871 | * forcing its runtime to zero. This way, update_curr_dl() stops | ||
872 | * it and the bandwidth timer will wake it up and will give it | ||
873 | * new scheduling parameters (thanks to dl_new=1). | ||
874 | */ | ||
875 | if (p->dl.runtime > 0) { | ||
876 | rq->curr->dl.dl_new = 1; | ||
877 | p->dl.runtime = 0; | ||
878 | } | ||
879 | update_curr_dl(rq); | ||
880 | } | ||
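For context, a minimal user-space counterpart of the yield semantics described above (not part of this patch): a periodic job sets its parameters via the sched_setattr() syscall added elsewhere in this series and calls sched_yield() at the end of each activation, which is what ends up in yield_task_dl(). The struct sched_attr layout, the SYS_sched_setattr macro and do_one_job() are assumptions here (there is no glibc wrapper at this point), and setting a -deadline policy needs the usual privileges.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>

struct sched_attr {
        __u32 size;
        __u32 sched_policy;
        __u64 sched_flags;
        __s32 sched_nice;
        __u32 sched_priority;
        __u64 sched_runtime;    /* all times in nanoseconds */
        __u64 sched_deadline;
        __u64 sched_period;
};

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6
#endif

static void do_one_job(void)
{
        /* stand-in for the real per-instance work */
        volatile unsigned long i;

        for (i = 0; i < 1000000; i++)
                ;
}

int main(void)
{
        struct sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  =  10 * 1000 * 1000,    /*  10ms budget   */
                .sched_deadline = 100 * 1000 * 1000,    /* 100ms deadline */
                .sched_period   = 100 * 1000 * 1000,    /* 100ms period   */
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
                perror("sched_setattr");
                return 1;
        }

        for (;;) {
                do_one_job();
                /* hand back any unused budget until the next period */
                sched_yield();
        }
}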
881 | |||
882 | #ifdef CONFIG_SMP | ||
883 | |||
884 | static int find_later_rq(struct task_struct *task); | ||
885 | |||
886 | static int | ||
887 | select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
888 | { | ||
889 | struct task_struct *curr; | ||
890 | struct rq *rq; | ||
891 | |||
892 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
893 | goto out; | ||
894 | |||
895 | rq = cpu_rq(cpu); | ||
896 | |||
897 | rcu_read_lock(); | ||
898 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
899 | |||
900 | /* | ||
901 | * If we are dealing with a -deadline task, we must | ||
902 | * decide where to wake it up. | ||
903 | * If it has a later deadline and the current task | ||
904 | * on this rq can't move (provided the waking task | ||
905 | * can!) we prefer to send it somewhere else. On the | ||
906 | * other hand, if it has a shorter deadline, we | ||
907 | * try to make it stay here; it might be important. | ||
908 | */ | ||
909 | if (unlikely(dl_task(curr)) && | ||
910 | (curr->nr_cpus_allowed < 2 || | ||
911 | !dl_entity_preempt(&p->dl, &curr->dl)) && | ||
912 | (p->nr_cpus_allowed > 1)) { | ||
913 | int target = find_later_rq(p); | ||
914 | |||
915 | if (target != -1) | ||
916 | cpu = target; | ||
917 | } | ||
918 | rcu_read_unlock(); | ||
919 | |||
920 | out: | ||
921 | return cpu; | ||
922 | } | ||
923 | |||
924 | static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | ||
925 | { | ||
926 | /* | ||
927 | * Current can't be migrated, useless to reschedule, | ||
928 | * let's hope p can move out. | ||
929 | */ | ||
930 | if (rq->curr->nr_cpus_allowed == 1 || | ||
931 | cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) | ||
932 | return; | ||
933 | |||
934 | /* | ||
935 | * p is migratable, so let's not schedule it and | ||
936 | * see if it is pushed or pulled somewhere else. | ||
937 | */ | ||
938 | if (p->nr_cpus_allowed != 1 && | ||
939 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) | ||
940 | return; | ||
941 | |||
942 | resched_task(rq->curr); | ||
943 | } | ||
944 | |||
945 | #endif /* CONFIG_SMP */ | ||
946 | |||
947 | /* | ||
948 | * Only called when both the current and waking task are -deadline | ||
949 | * tasks. | ||
950 | */ | ||
951 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
952 | int flags) | ||
953 | { | ||
954 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { | ||
955 | resched_task(rq->curr); | ||
956 | return; | ||
957 | } | ||
958 | |||
959 | #ifdef CONFIG_SMP | ||
960 | /* | ||
961 | * In the unlikely case current and p have the same deadline | ||
962 | * let us try to decide what's the best thing to do... | ||
963 | */ | ||
964 | if ((p->dl.deadline == rq->curr->dl.deadline) && | ||
965 | !test_tsk_need_resched(rq->curr)) | ||
966 | check_preempt_equal_dl(rq, p); | ||
967 | #endif /* CONFIG_SMP */ | ||
968 | } | ||
969 | |||
970 | #ifdef CONFIG_SCHED_HRTICK | ||
971 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | ||
972 | { | ||
973 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | ||
974 | |||
975 | if (delta > 10000) | ||
976 | hrtick_start(rq, p->dl.runtime); | ||
977 | } | ||
978 | #endif | ||
979 | |||
980 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | ||
981 | struct dl_rq *dl_rq) | ||
982 | { | ||
983 | struct rb_node *left = dl_rq->rb_leftmost; | ||
984 | |||
985 | if (!left) | ||
986 | return NULL; | ||
987 | |||
988 | return rb_entry(left, struct sched_dl_entity, rb_node); | ||
989 | } | ||
990 | |||
991 | struct task_struct *pick_next_task_dl(struct rq *rq) | ||
992 | { | ||
993 | struct sched_dl_entity *dl_se; | ||
994 | struct task_struct *p; | ||
995 | struct dl_rq *dl_rq; | ||
996 | |||
997 | dl_rq = &rq->dl; | ||
998 | |||
999 | if (unlikely(!dl_rq->dl_nr_running)) | ||
1000 | return NULL; | ||
1001 | |||
1002 | dl_se = pick_next_dl_entity(rq, dl_rq); | ||
1003 | BUG_ON(!dl_se); | ||
1004 | |||
1005 | p = dl_task_of(dl_se); | ||
1006 | p->se.exec_start = rq_clock_task(rq); | ||
1007 | |||
1008 | /* Running task will never be pushed. */ | ||
1009 | dequeue_pushable_dl_task(rq, p); | ||
1010 | |||
1011 | #ifdef CONFIG_SCHED_HRTICK | ||
1012 | if (hrtick_enabled(rq)) | ||
1013 | start_hrtick_dl(rq, p); | ||
1014 | #endif | ||
1015 | |||
1016 | #ifdef CONFIG_SMP | ||
1017 | rq->post_schedule = has_pushable_dl_tasks(rq); | ||
1018 | #endif /* CONFIG_SMP */ | ||
1019 | |||
1020 | return p; | ||
1021 | } | ||
1022 | |||
1023 | static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | ||
1024 | { | ||
1025 | update_curr_dl(rq); | ||
1026 | |||
1027 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) | ||
1028 | enqueue_pushable_dl_task(rq, p); | ||
1029 | } | ||
1030 | |||
1031 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | ||
1032 | { | ||
1033 | update_curr_dl(rq); | ||
1034 | |||
1035 | #ifdef CONFIG_SCHED_HRTICK | ||
1036 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | ||
1037 | start_hrtick_dl(rq, p); | ||
1038 | #endif | ||
1039 | } | ||
1040 | |||
1041 | static void task_fork_dl(struct task_struct *p) | ||
1042 | { | ||
1043 | /* | ||
1044 | * SCHED_DEADLINE tasks cannot fork; this is enforced in | ||
1045 | * sched_fork(). | ||
1046 | */ | ||
1047 | } | ||
1048 | |||
1049 | static void task_dead_dl(struct task_struct *p) | ||
1050 | { | ||
1051 | struct hrtimer *timer = &p->dl.dl_timer; | ||
1052 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1053 | |||
1054 | /* | ||
1055 | * Since we are TASK_DEAD we won't slip out of the domain! | ||
1056 | */ | ||
1057 | raw_spin_lock_irq(&dl_b->lock); | ||
1058 | dl_b->total_bw -= p->dl.dl_bw; | ||
1059 | raw_spin_unlock_irq(&dl_b->lock); | ||
1060 | |||
1061 | hrtimer_cancel(timer); | ||
1062 | } | ||
1063 | |||
1064 | static void set_curr_task_dl(struct rq *rq) | ||
1065 | { | ||
1066 | struct task_struct *p = rq->curr; | ||
1067 | |||
1068 | p->se.exec_start = rq_clock_task(rq); | ||
1069 | |||
1070 | /* You can't push away the running task */ | ||
1071 | dequeue_pushable_dl_task(rq, p); | ||
1072 | } | ||
1073 | |||
1074 | #ifdef CONFIG_SMP | ||
1075 | |||
1076 | /* Only try algorithms three times */ | ||
1077 | #define DL_MAX_TRIES 3 | ||
1078 | |||
1079 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | ||
1080 | { | ||
1081 | if (!task_running(rq, p) && | ||
1082 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | ||
1083 | (p->nr_cpus_allowed > 1)) | ||
1084 | return 1; | ||
1085 | |||
1086 | return 0; | ||
1087 | } | ||
1088 | |||
1089 | /* Returns the second earliest -deadline task, NULL otherwise */ | ||
1090 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) | ||
1091 | { | ||
1092 | struct rb_node *next_node = rq->dl.rb_leftmost; | ||
1093 | struct sched_dl_entity *dl_se; | ||
1094 | struct task_struct *p = NULL; | ||
1095 | |||
1096 | next_node: | ||
1097 | next_node = rb_next(next_node); | ||
1098 | if (next_node) { | ||
1099 | dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); | ||
1100 | p = dl_task_of(dl_se); | ||
1101 | |||
1102 | if (pick_dl_task(rq, p, cpu)) | ||
1103 | return p; | ||
1104 | |||
1105 | goto next_node; | ||
1106 | } | ||
1107 | |||
1108 | return NULL; | ||
1109 | } | ||
1110 | |||
1111 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); | ||
1112 | |||
1113 | static int find_later_rq(struct task_struct *task) | ||
1114 | { | ||
1115 | struct sched_domain *sd; | ||
1116 | struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); | ||
1117 | int this_cpu = smp_processor_id(); | ||
1118 | int best_cpu, cpu = task_cpu(task); | ||
1119 | |||
1120 | /* Make sure the mask is initialized first */ | ||
1121 | if (unlikely(!later_mask)) | ||
1122 | return -1; | ||
1123 | |||
1124 | if (task->nr_cpus_allowed == 1) | ||
1125 | return -1; | ||
1126 | |||
1127 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | ||
1128 | task, later_mask); | ||
1129 | if (best_cpu == -1) | ||
1130 | return -1; | ||
1131 | |||
1132 | /* | ||
1133 | * If we are here, some target has been found, | ||
1134 | * the most suitable of which is cached in best_cpu. | ||
1135 | * That is, among the runqueues whose current tasks | ||
1136 | * have later deadlines than our task's, the rq | ||
1137 | * serving the latest deadline of all. | ||
1138 | * | ||
1139 | * Now we check how well this matches with task's | ||
1140 | * affinity and system topology. | ||
1141 | * | ||
1142 | * The last cpu where the task ran is our first | ||
1143 | * guess, since it is most likely cache-hot there. | ||
1144 | */ | ||
1145 | if (cpumask_test_cpu(cpu, later_mask)) | ||
1146 | return cpu; | ||
1147 | /* | ||
1148 | * Check if this_cpu is to be skipped (i.e., it is | ||
1149 | * not in the mask) or not. | ||
1150 | */ | ||
1151 | if (!cpumask_test_cpu(this_cpu, later_mask)) | ||
1152 | this_cpu = -1; | ||
1153 | |||
1154 | rcu_read_lock(); | ||
1155 | for_each_domain(cpu, sd) { | ||
1156 | if (sd->flags & SD_WAKE_AFFINE) { | ||
1157 | |||
1158 | /* | ||
1159 | * If possible, preempting this_cpu is | ||
1160 | * cheaper than migrating. | ||
1161 | */ | ||
1162 | if (this_cpu != -1 && | ||
1163 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { | ||
1164 | rcu_read_unlock(); | ||
1165 | return this_cpu; | ||
1166 | } | ||
1167 | |||
1168 | /* | ||
1169 | * Last chance: if best_cpu is valid and is | ||
1170 | * in the mask, that becomes our choice. | ||
1171 | */ | ||
1172 | if (best_cpu < nr_cpu_ids && | ||
1173 | cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { | ||
1174 | rcu_read_unlock(); | ||
1175 | return best_cpu; | ||
1176 | } | ||
1177 | } | ||
1178 | } | ||
1179 | rcu_read_unlock(); | ||
1180 | |||
1181 | /* | ||
1182 | * At this point, all our guesses failed, we just return | ||
1183 | * 'something', and let the caller sort things out. | ||
1184 | */ | ||
1185 | if (this_cpu != -1) | ||
1186 | return this_cpu; | ||
1187 | |||
1188 | cpu = cpumask_any(later_mask); | ||
1189 | if (cpu < nr_cpu_ids) | ||
1190 | return cpu; | ||
1191 | |||
1192 | return -1; | ||
1193 | } | ||
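find_later_rq() leans on cpudl_set()/cpudl_find() from cpudeadline.c, which belongs to this series but is not shown in this section. As a rough mental model only (the real code keeps a max-heap of per-CPU deadlines; the ex_* names and the linear scan below are purely illustrative), they behave like this:

struct ex_cpudl {
        u64     curr_dl[NR_CPUS];       /* deadline served by each CPU   */
        bool    has_dl[NR_CPUS];        /* does that CPU run a -dl task? */
};

/* record (or clear, if !is_valid) the deadline currently served by @cpu */
static void ex_cpudl_set(struct ex_cpudl *cp, int cpu, u64 dl, int is_valid)
{
        cp->has_dl[cpu] = is_valid;
        cp->curr_dl[cpu] = dl;
}

/*
 * Return a CPU in @p's affinity that is either free of -deadline tasks or
 * serving a later deadline than @p's (-1 if there is none), optionally
 * collecting all such CPUs in @later_mask.
 */
static int ex_cpudl_find(struct ex_cpudl *cp, struct task_struct *p,
                         struct cpumask *later_mask)
{
        int cpu, best = -1;
        bool best_free = false;
        u64 best_dl = 0;

        for_each_online_cpu(cpu) {
                if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                        continue;
                /* skip CPUs already serving an earlier-or-equal deadline */
                if (cp->has_dl[cpu] &&
                    !dl_time_before(p->dl.deadline, cp->curr_dl[cpu]))
                        continue;
                if (later_mask)
                        cpumask_set_cpu(cpu, later_mask);
                if (!cp->has_dl[cpu]) {
                        /* a CPU with no -deadline load at all is best */
                        best = cpu;
                        best_free = true;
                } else if (!best_free && (best == -1 ||
                           dl_time_before(best_dl, cp->curr_dl[cpu]))) {
                        /* otherwise prefer the latest-deadline CPU */
                        best = cpu;
                        best_dl = cp->curr_dl[cpu];
                }
        }

        return best;
}

With that picture, later_mask as used in the domain walk above is simply "CPUs in the task's affinity that would not be forced to drop an earlier deadline".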
1194 | |||
1195 | /* Locks the rq it finds */ | ||
1196 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | ||
1197 | { | ||
1198 | struct rq *later_rq = NULL; | ||
1199 | int tries; | ||
1200 | int cpu; | ||
1201 | |||
1202 | for (tries = 0; tries < DL_MAX_TRIES; tries++) { | ||
1203 | cpu = find_later_rq(task); | ||
1204 | |||
1205 | if ((cpu == -1) || (cpu == rq->cpu)) | ||
1206 | break; | ||
1207 | |||
1208 | later_rq = cpu_rq(cpu); | ||
1209 | |||
1210 | /* Retry if something changed. */ | ||
1211 | if (double_lock_balance(rq, later_rq)) { | ||
1212 | if (unlikely(task_rq(task) != rq || | ||
1213 | !cpumask_test_cpu(later_rq->cpu, | ||
1214 | &task->cpus_allowed) || | ||
1215 | task_running(rq, task) || !task->on_rq)) { | ||
1216 | double_unlock_balance(rq, later_rq); | ||
1217 | later_rq = NULL; | ||
1218 | break; | ||
1219 | } | ||
1220 | } | ||
1221 | |||
1222 | /* | ||
1223 | * If the rq we found has no -deadline task, or | ||
1224 | * its earliest one has a later deadline than our | ||
1225 | * task, the rq is a good one. | ||
1226 | */ | ||
1227 | if (!later_rq->dl.dl_nr_running || | ||
1228 | dl_time_before(task->dl.deadline, | ||
1229 | later_rq->dl.earliest_dl.curr)) | ||
1230 | break; | ||
1231 | |||
1232 | /* Otherwise we try again. */ | ||
1233 | double_unlock_balance(rq, later_rq); | ||
1234 | later_rq = NULL; | ||
1235 | } | ||
1236 | |||
1237 | return later_rq; | ||
1238 | } | ||
1239 | |||
1240 | static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | ||
1241 | { | ||
1242 | struct task_struct *p; | ||
1243 | |||
1244 | if (!has_pushable_dl_tasks(rq)) | ||
1245 | return NULL; | ||
1246 | |||
1247 | p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, | ||
1248 | struct task_struct, pushable_dl_tasks); | ||
1249 | |||
1250 | BUG_ON(rq->cpu != task_cpu(p)); | ||
1251 | BUG_ON(task_current(rq, p)); | ||
1252 | BUG_ON(p->nr_cpus_allowed <= 1); | ||
1253 | |||
1254 | BUG_ON(!p->on_rq); | ||
1255 | BUG_ON(!dl_task(p)); | ||
1256 | |||
1257 | return p; | ||
1258 | } | ||
1259 | |||
1260 | /* | ||
1261 | * See if the non-running -deadline tasks on this rq | ||
1262 | * can be sent to some other CPU where they can preempt | ||
1263 | * and start executing. | ||
1264 | */ | ||
1265 | static int push_dl_task(struct rq *rq) | ||
1266 | { | ||
1267 | struct task_struct *next_task; | ||
1268 | struct rq *later_rq; | ||
1269 | |||
1270 | if (!rq->dl.overloaded) | ||
1271 | return 0; | ||
1272 | |||
1273 | next_task = pick_next_pushable_dl_task(rq); | ||
1274 | if (!next_task) | ||
1275 | return 0; | ||
1276 | |||
1277 | retry: | ||
1278 | if (unlikely(next_task == rq->curr)) { | ||
1279 | WARN_ON(1); | ||
1280 | return 0; | ||
1281 | } | ||
1282 | |||
1283 | /* | ||
1284 | * If next_task preempts rq->curr, and rq->curr | ||
1285 | * can move away, it makes sense to just reschedule | ||
1286 | * without going further in pushing next_task. | ||
1287 | */ | ||
1288 | if (dl_task(rq->curr) && | ||
1289 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && | ||
1290 | rq->curr->nr_cpus_allowed > 1) { | ||
1291 | resched_task(rq->curr); | ||
1292 | return 0; | ||
1293 | } | ||
1294 | |||
1295 | /* We might release rq lock */ | ||
1296 | get_task_struct(next_task); | ||
1297 | |||
1298 | /* Will lock the rq it'll find */ | ||
1299 | later_rq = find_lock_later_rq(next_task, rq); | ||
1300 | if (!later_rq) { | ||
1301 | struct task_struct *task; | ||
1302 | |||
1303 | /* | ||
1304 | * We must check all this again, since | ||
1305 | * find_lock_later_rq releases rq->lock and it is | ||
1306 | * then possible that next_task has migrated. | ||
1307 | */ | ||
1308 | task = pick_next_pushable_dl_task(rq); | ||
1309 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | ||
1310 | /* | ||
1311 | * The task is still there. We don't try | ||
1312 | * again, some other cpu will pull it when ready. | ||
1313 | */ | ||
1314 | dequeue_pushable_dl_task(rq, next_task); | ||
1315 | goto out; | ||
1316 | } | ||
1317 | |||
1318 | if (!task) | ||
1319 | /* No more tasks */ | ||
1320 | goto out; | ||
1321 | |||
1322 | put_task_struct(next_task); | ||
1323 | next_task = task; | ||
1324 | goto retry; | ||
1325 | } | ||
1326 | |||
1327 | deactivate_task(rq, next_task, 0); | ||
1328 | set_task_cpu(next_task, later_rq->cpu); | ||
1329 | activate_task(later_rq, next_task, 0); | ||
1330 | |||
1331 | resched_task(later_rq->curr); | ||
1332 | |||
1333 | double_unlock_balance(rq, later_rq); | ||
1334 | |||
1335 | out: | ||
1336 | put_task_struct(next_task); | ||
1337 | |||
1338 | return 1; | ||
1339 | } | ||
1340 | |||
1341 | static void push_dl_tasks(struct rq *rq) | ||
1342 | { | ||
1343 | /* Terminates as it moves a -deadline task */ | ||
1344 | while (push_dl_task(rq)) | ||
1345 | ; | ||
1346 | } | ||
1347 | |||
1348 | static int pull_dl_task(struct rq *this_rq) | ||
1349 | { | ||
1350 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
1351 | struct task_struct *p; | ||
1352 | struct rq *src_rq; | ||
1353 | u64 dmin = LONG_MAX; | ||
1354 | |||
1355 | if (likely(!dl_overloaded(this_rq))) | ||
1356 | return 0; | ||
1357 | |||
1358 | /* | ||
1359 | * Match the barrier from dl_set_overloaded; this guarantees that if we | ||
1360 | * see overloaded we must also see the dlo_mask bit. | ||
1361 | */ | ||
1362 | smp_rmb(); | ||
1363 | |||
1364 | for_each_cpu(cpu, this_rq->rd->dlo_mask) { | ||
1365 | if (this_cpu == cpu) | ||
1366 | continue; | ||
1367 | |||
1368 | src_rq = cpu_rq(cpu); | ||
1369 | |||
1370 | /* | ||
1371 | * It looks racy, and it is! However, as in sched_rt.c, | ||
1372 | * we are fine with this. | ||
1373 | */ | ||
1374 | if (this_rq->dl.dl_nr_running && | ||
1375 | dl_time_before(this_rq->dl.earliest_dl.curr, | ||
1376 | src_rq->dl.earliest_dl.next)) | ||
1377 | continue; | ||
1378 | |||
1379 | /* Might drop this_rq->lock */ | ||
1380 | double_lock_balance(this_rq, src_rq); | ||
1381 | |||
1382 | /* | ||
1383 | * If there are no more pullable tasks on the | ||
1384 | * rq, we're done with it. | ||
1385 | */ | ||
1386 | if (src_rq->dl.dl_nr_running <= 1) | ||
1387 | goto skip; | ||
1388 | |||
1389 | p = pick_next_earliest_dl_task(src_rq, this_cpu); | ||
1390 | |||
1391 | /* | ||
1392 | * We found a task to be pulled if: | ||
1393 | * - it preempts our current (if there's one), | ||
1394 | * - it will preempt the last one we pulled (if any). | ||
1395 | */ | ||
1396 | if (p && dl_time_before(p->dl.deadline, dmin) && | ||
1397 | (!this_rq->dl.dl_nr_running || | ||
1398 | dl_time_before(p->dl.deadline, | ||
1399 | this_rq->dl.earliest_dl.curr))) { | ||
1400 | WARN_ON(p == src_rq->curr); | ||
1401 | WARN_ON(!p->on_rq); | ||
1402 | |||
1403 | /* | ||
1404 | * Then we pull iff p has actually an earlier | ||
1405 | * deadline than the current task of its runqueue. | ||
1406 | */ | ||
1407 | if (dl_time_before(p->dl.deadline, | ||
1408 | src_rq->curr->dl.deadline)) | ||
1409 | goto skip; | ||
1410 | |||
1411 | ret = 1; | ||
1412 | |||
1413 | deactivate_task(src_rq, p, 0); | ||
1414 | set_task_cpu(p, this_cpu); | ||
1415 | activate_task(this_rq, p, 0); | ||
1416 | dmin = p->dl.deadline; | ||
1417 | |||
1418 | /* Is there any other task even earlier? */ | ||
1419 | } | ||
1420 | skip: | ||
1421 | double_unlock_balance(this_rq, src_rq); | ||
1422 | } | ||
1423 | |||
1424 | return ret; | ||
1425 | } | ||
1426 | |||
1427 | static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) | ||
1428 | { | ||
1429 | /* Try to pull other tasks here */ | ||
1430 | if (dl_task(prev)) | ||
1431 | pull_dl_task(rq); | ||
1432 | } | ||
1433 | |||
1434 | static void post_schedule_dl(struct rq *rq) | ||
1435 | { | ||
1436 | push_dl_tasks(rq); | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * Since the task is not running and a reschedule is not going to happen | ||
1441 | * anytime soon on its runqueue, we try pushing it away now. | ||
1442 | */ | ||
1443 | static void task_woken_dl(struct rq *rq, struct task_struct *p) | ||
1444 | { | ||
1445 | if (!task_running(rq, p) && | ||
1446 | !test_tsk_need_resched(rq->curr) && | ||
1447 | has_pushable_dl_tasks(rq) && | ||
1448 | p->nr_cpus_allowed > 1 && | ||
1449 | dl_task(rq->curr) && | ||
1450 | (rq->curr->nr_cpus_allowed < 2 || | ||
1451 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | ||
1452 | push_dl_tasks(rq); | ||
1453 | } | ||
1454 | } | ||
1455 | |||
1456 | static void set_cpus_allowed_dl(struct task_struct *p, | ||
1457 | const struct cpumask *new_mask) | ||
1458 | { | ||
1459 | struct rq *rq; | ||
1460 | int weight; | ||
1461 | |||
1462 | BUG_ON(!dl_task(p)); | ||
1463 | |||
1464 | /* | ||
1465 | * Update only if the task is actually running (i.e., | ||
1466 | * it is on the rq AND it is not throttled). | ||
1467 | */ | ||
1468 | if (!on_dl_rq(&p->dl)) | ||
1469 | return; | ||
1470 | |||
1471 | weight = cpumask_weight(new_mask); | ||
1472 | |||
1473 | /* | ||
1474 | * Only update if the task's ability to migrate actually changes, | ||
1475 | * i.e. it goes from migratable to pinned or vice versa. | ||
1476 | */ | ||
1477 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | ||
1478 | return; | ||
1479 | |||
1480 | rq = task_rq(p); | ||
1481 | |||
1482 | /* | ||
1483 | * The process used to be able to migrate OR it can now migrate | ||
1484 | */ | ||
1485 | if (weight <= 1) { | ||
1486 | if (!task_current(rq, p)) | ||
1487 | dequeue_pushable_dl_task(rq, p); | ||
1488 | BUG_ON(!rq->dl.dl_nr_migratory); | ||
1489 | rq->dl.dl_nr_migratory--; | ||
1490 | } else { | ||
1491 | if (!task_current(rq, p)) | ||
1492 | enqueue_pushable_dl_task(rq, p); | ||
1493 | rq->dl.dl_nr_migratory++; | ||
1494 | } | ||
1495 | |||
1496 | update_dl_migration(&rq->dl); | ||
1497 | } | ||
1498 | |||
1499 | /* Assumes rq->lock is held */ | ||
1500 | static void rq_online_dl(struct rq *rq) | ||
1501 | { | ||
1502 | if (rq->dl.overloaded) | ||
1503 | dl_set_overload(rq); | ||
1504 | |||
1505 | if (rq->dl.dl_nr_running > 0) | ||
1506 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | ||
1507 | } | ||
1508 | |||
1509 | /* Assumes rq->lock is held */ | ||
1510 | static void rq_offline_dl(struct rq *rq) | ||
1511 | { | ||
1512 | if (rq->dl.overloaded) | ||
1513 | dl_clear_overload(rq); | ||
1514 | |||
1515 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
1516 | } | ||
1517 | |||
1518 | void init_sched_dl_class(void) | ||
1519 | { | ||
1520 | unsigned int i; | ||
1521 | |||
1522 | for_each_possible_cpu(i) | ||
1523 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), | ||
1524 | GFP_KERNEL, cpu_to_node(i)); | ||
1525 | } | ||
1526 | |||
1527 | #endif /* CONFIG_SMP */ | ||
1528 | |||
1529 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | ||
1530 | { | ||
1531 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | ||
1532 | hrtimer_try_to_cancel(&p->dl.dl_timer); | ||
1533 | |||
1534 | #ifdef CONFIG_SMP | ||
1535 | /* | ||
1536 | * Since this might be the only -deadline task on the rq, | ||
1537 | * this is the right place to try to pull some other one | ||
1538 | * from an overloaded cpu, if any. | ||
1539 | */ | ||
1540 | if (!rq->dl.dl_nr_running) | ||
1541 | pull_dl_task(rq); | ||
1542 | #endif | ||
1543 | } | ||
1544 | |||
1545 | /* | ||
1546 | * When switching to -deadline, we may overload the rq, so we | ||
1547 | * try to push some other task away, if possible. | ||
1548 | */ | ||
1549 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | ||
1550 | { | ||
1551 | int check_resched = 1; | ||
1552 | |||
1553 | /* | ||
1554 | * If p is throttled, don't consider the possibility | ||
1555 | * of preempting rq->curr, the check will be done right | ||
1556 | * after its runtime gets replenished. | ||
1557 | */ | ||
1558 | if (unlikely(p->dl.dl_throttled)) | ||
1559 | return; | ||
1560 | |||
1561 | if (p->on_rq || rq->curr != p) { | ||
1562 | #ifdef CONFIG_SMP | ||
1563 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | ||
1564 | /* Only reschedule if pushing failed */ | ||
1565 | check_resched = 0; | ||
1566 | #endif /* CONFIG_SMP */ | ||
1567 | if (check_resched && task_has_dl_policy(rq->curr)) | ||
1568 | check_preempt_curr_dl(rq, p, 0); | ||
1569 | } | ||
1570 | } | ||
1571 | |||
1572 | /* | ||
1573 | * If the scheduling parameters of a -deadline task changed, | ||
1574 | * a push or pull operation might be needed. | ||
1575 | */ | ||
1576 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | ||
1577 | int oldprio) | ||
1578 | { | ||
1579 | if (p->on_rq || rq->curr == p) { | ||
1580 | #ifdef CONFIG_SMP | ||
1581 | /* | ||
1582 | * This might be too much, but unfortunately | ||
1583 | * we don't have the old deadline value, and | ||
1584 | * we can't tell whether the task is raising | ||
1585 | * or lowering its prio, so... | ||
1586 | */ | ||
1587 | if (!rq->dl.overloaded) | ||
1588 | pull_dl_task(rq); | ||
1589 | |||
1590 | /* | ||
1591 | * If we now have an earlier deadline task than p, | ||
1592 | * then reschedule, provided p is still on this | ||
1593 | * runqueue. | ||
1594 | */ | ||
1595 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && | ||
1596 | rq->curr == p) | ||
1597 | resched_task(p); | ||
1598 | #else | ||
1599 | /* | ||
1600 | * Again, we don't know if p has an earlier | ||
1601 | * or later deadline, so let's blindly set a | ||
1602 | * (maybe not needed) rescheduling point. | ||
1603 | */ | ||
1604 | resched_task(p); | ||
1605 | #endif /* CONFIG_SMP */ | ||
1606 | } else | ||
1607 | switched_to_dl(rq, p); | ||
1608 | } | ||
1609 | |||
1610 | const struct sched_class dl_sched_class = { | ||
1611 | .next = &rt_sched_class, | ||
1612 | .enqueue_task = enqueue_task_dl, | ||
1613 | .dequeue_task = dequeue_task_dl, | ||
1614 | .yield_task = yield_task_dl, | ||
1615 | |||
1616 | .check_preempt_curr = check_preempt_curr_dl, | ||
1617 | |||
1618 | .pick_next_task = pick_next_task_dl, | ||
1619 | .put_prev_task = put_prev_task_dl, | ||
1620 | |||
1621 | #ifdef CONFIG_SMP | ||
1622 | .select_task_rq = select_task_rq_dl, | ||
1623 | .set_cpus_allowed = set_cpus_allowed_dl, | ||
1624 | .rq_online = rq_online_dl, | ||
1625 | .rq_offline = rq_offline_dl, | ||
1626 | .pre_schedule = pre_schedule_dl, | ||
1627 | .post_schedule = post_schedule_dl, | ||
1628 | .task_woken = task_woken_dl, | ||
1629 | #endif | ||
1630 | |||
1631 | .set_curr_task = set_curr_task_dl, | ||
1632 | .task_tick = task_tick_dl, | ||
1633 | .task_fork = task_fork_dl, | ||
1634 | .task_dead = task_dead_dl, | ||
1635 | |||
1636 | .prio_changed = prio_changed_dl, | ||
1637 | .switched_from = switched_from_dl, | ||
1638 | .switched_to = switched_to_dl, | ||
1639 | }; | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
140 | #endif | 140 | #endif |
141 | #ifdef CONFIG_NUMA_BALANCING | 141 | #ifdef CONFIG_NUMA_BALANCING |
142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | 142 | SEQ_printf(m, " %d", task_node(p)); |
143 | #endif | 143 | #endif |
144 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) | |||
371 | PN(cpu_clk); | 371 | PN(cpu_clk); |
372 | P(jiffies); | 372 | P(jiffies); |
373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
374 | P(sched_clock_stable); | 374 | P(sched_clock_stable()); |
375 | #endif | 375 | #endif |
376 | #undef PN | 376 | #undef PN |
377 | #undef P | 377 | #undef P |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c7395d97e4cb..9b4c4f320130 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p) | |||
872 | return max(smin, smax); | 872 | return max(smin, smax); |
873 | } | 873 | } |
874 | 874 | ||
875 | /* | ||
876 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
877 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
878 | * scans. This will give the process the chance to accumulate more faults on | ||
879 | * the preferred node but still allow the scheduler to move the task again if | ||
880 | * the nodes CPUs are overloaded. | ||
881 | */ | ||
882 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
883 | |||
884 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | 875 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) |
885 | { | 876 | { |
886 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | 877 | rq->nr_numa_running += (p->numa_preferred_nid != -1); |
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
930 | if (!p->numa_group) | 921 | if (!p->numa_group) |
931 | return 0; | 922 | return 0; |
932 | 923 | ||
933 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | 924 | return p->numa_group->faults[task_faults_idx(nid, 0)] + |
925 | p->numa_group->faults[task_faults_idx(nid, 1)]; | ||
934 | } | 926 | } |
935 | 927 | ||
936 | /* | 928 | /* |
@@ -1023,7 +1015,7 @@ struct task_numa_env { | |||
1023 | 1015 | ||
1024 | struct numa_stats src_stats, dst_stats; | 1016 | struct numa_stats src_stats, dst_stats; |
1025 | 1017 | ||
1026 | int imbalance_pct, idx; | 1018 | int imbalance_pct; |
1027 | 1019 | ||
1028 | struct task_struct *best_task; | 1020 | struct task_struct *best_task; |
1029 | long best_imp; | 1021 | long best_imp; |
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1211 | * elsewhere, so there is no point in (re)trying. | 1203 | * elsewhere, so there is no point in (re)trying. |
1212 | */ | 1204 | */ |
1213 | if (unlikely(!sd)) { | 1205 | if (unlikely(!sd)) { |
1214 | p->numa_preferred_nid = cpu_to_node(task_cpu(p)); | 1206 | p->numa_preferred_nid = task_node(p); |
1215 | return -EINVAL; | 1207 | return -EINVAL; |
1216 | } | 1208 | } |
1217 | 1209 | ||
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) | |||
1258 | p->numa_scan_period = task_scan_min(p); | 1250 | p->numa_scan_period = task_scan_min(p); |
1259 | 1251 | ||
1260 | if (env.best_task == NULL) { | 1252 | if (env.best_task == NULL) { |
1261 | int ret = migrate_task_to(p, env.best_cpu); | 1253 | ret = migrate_task_to(p, env.best_cpu); |
1254 | if (ret != 0) | ||
1255 | trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); | ||
1262 | return ret; | 1256 | return ret; |
1263 | } | 1257 | } |
1264 | 1258 | ||
1265 | ret = migrate_swap(p, env.best_task); | 1259 | ret = migrate_swap(p, env.best_task); |
1260 | if (ret != 0) | ||
1261 | trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); | ||
1266 | put_task_struct(env.best_task); | 1262 | put_task_struct(env.best_task); |
1267 | return ret; | 1263 | return ret; |
1268 | } | 1264 | } |
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1278 | p->numa_migrate_retry = jiffies + HZ; | 1274 | p->numa_migrate_retry = jiffies + HZ; |
1279 | 1275 | ||
1280 | /* Success if task is already running on preferred CPU */ | 1276 | /* Success if task is already running on preferred CPU */ |
1281 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | 1277 | if (task_node(p) == p->numa_preferred_nid) |
1282 | return; | 1278 | return; |
1283 | 1279 | ||
1284 | /* Otherwise, try migrate to a CPU on the preferred node */ | 1280 | /* Otherwise, try migrate to a CPU on the preferred node */ |
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p, | |||
1350 | * scanning faster if shared accesses dominate as it may | 1346 | * scanning faster if shared accesses dominate as it may |
1351 | * simply bounce migrations uselessly | 1347 | * simply bounce migrations uselessly |
1352 | */ | 1348 | */ |
1353 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
1354 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1349 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); |
1355 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1350 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
1356 | } | 1351 | } |
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work) | |||
1762 | start = end; | 1757 | start = end; |
1763 | if (pages <= 0) | 1758 | if (pages <= 0) |
1764 | goto out; | 1759 | goto out; |
1760 | |||
1761 | cond_resched(); | ||
1765 | } while (end != vma->vm_end); | 1762 | } while (end != vma->vm_end); |
1766 | } | 1763 | } |
1767 | 1764 | ||
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2365 | } | 2362 | } |
2366 | wakeup = 0; | 2363 | wakeup = 0; |
2367 | } else { | 2364 | } else { |
2368 | /* | 2365 | __synchronize_entity_decay(se); |
2369 | * Task re-woke on same cpu (or else migrate_task_rq_fair() | ||
2370 | * would have made count negative); we must be careful to avoid | ||
2371 | * double-accounting blocked time after synchronizing decays. | ||
2372 | */ | ||
2373 | se->avg.last_runnable_update += __synchronize_entity_decay(se) | ||
2374 | << 20; | ||
2375 | } | 2366 | } |
2376 | 2367 | ||
2377 | /* migrated tasks did not contribute to our blocked load */ | 2368 | /* migrated tasks did not contribute to our blocked load */ |
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
3923 | { | 3914 | { |
3924 | struct sched_entity *se = tg->se[cpu]; | 3915 | struct sched_entity *se = tg->se[cpu]; |
3925 | 3916 | ||
3926 | if (!tg->parent || !wl) /* the trivial, non-cgroup case */ | 3917 | if (!tg->parent) /* the trivial, non-cgroup case */ |
3927 | return wl; | 3918 | return wl; |
3928 | 3919 | ||
3929 | for_each_sched_entity(se) { | 3920 | for_each_sched_entity(se) { |
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4101 | */ | 4092 | */ |
4102 | static struct sched_group * | 4093 | static struct sched_group * |
4103 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 4094 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
4104 | int this_cpu, int load_idx) | 4095 | int this_cpu, int sd_flag) |
4105 | { | 4096 | { |
4106 | struct sched_group *idlest = NULL, *group = sd->groups; | 4097 | struct sched_group *idlest = NULL, *group = sd->groups; |
4107 | unsigned long min_load = ULONG_MAX, this_load = 0; | 4098 | unsigned long min_load = ULONG_MAX, this_load = 0; |
4099 | int load_idx = sd->forkexec_idx; | ||
4108 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 4100 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
4109 | 4101 | ||
4102 | if (sd_flag & SD_BALANCE_WAKE) | ||
4103 | load_idx = sd->wake_idx; | ||
4104 | |||
4110 | do { | 4105 | do { |
4111 | unsigned long load, avg_load; | 4106 | unsigned long load, avg_load; |
4112 | int local_group; | 4107 | int local_group; |
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4274 | } | 4269 | } |
4275 | 4270 | ||
4276 | while (sd) { | 4271 | while (sd) { |
4277 | int load_idx = sd->forkexec_idx; | ||
4278 | struct sched_group *group; | 4272 | struct sched_group *group; |
4279 | int weight; | 4273 | int weight; |
4280 | 4274 | ||
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4283 | continue; | 4277 | continue; |
4284 | } | 4278 | } |
4285 | 4279 | ||
4286 | if (sd_flag & SD_BALANCE_WAKE) | 4280 | group = find_idlest_group(sd, p, cpu, sd_flag); |
4287 | load_idx = sd->wake_idx; | ||
4288 | |||
4289 | group = find_idlest_group(sd, p, cpu, load_idx); | ||
4290 | if (!group) { | 4281 | if (!group) { |
4291 | sd = sd->child; | 4282 | sd = sd->child; |
4292 | continue; | 4283 | continue; |
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5512 | struct sched_group *group, int load_idx, | 5503 | struct sched_group *group, int load_idx, |
5513 | int local_group, struct sg_lb_stats *sgs) | 5504 | int local_group, struct sg_lb_stats *sgs) |
5514 | { | 5505 | { |
5515 | unsigned long nr_running; | ||
5516 | unsigned long load; | 5506 | unsigned long load; |
5517 | int i; | 5507 | int i; |
5518 | 5508 | ||
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5521 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5511 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
5522 | struct rq *rq = cpu_rq(i); | 5512 | struct rq *rq = cpu_rq(i); |
5523 | 5513 | ||
5524 | nr_running = rq->nr_running; | ||
5525 | |||
5526 | /* Bias balancing toward cpus of our domain */ | 5514 | /* Bias balancing toward cpus of our domain */ |
5527 | if (local_group) | 5515 | if (local_group) |
5528 | load = target_load(i, load_idx); | 5516 | load = target_load(i, load_idx); |
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5530 | load = source_load(i, load_idx); | 5518 | load = source_load(i, load_idx); |
5531 | 5519 | ||
5532 | sgs->group_load += load; | 5520 | sgs->group_load += load; |
5533 | sgs->sum_nr_running += nr_running; | 5521 | sgs->sum_nr_running += rq->nr_running; |
5534 | #ifdef CONFIG_NUMA_BALANCING | 5522 | #ifdef CONFIG_NUMA_BALANCING |
5535 | sgs->nr_numa_running += rq->nr_numa_running; | 5523 | sgs->nr_numa_running += rq->nr_numa_running; |
5536 | sgs->nr_preferred_running += rq->nr_preferred_running; | 5524 | sgs->nr_preferred_running += rq->nr_preferred_running; |
@@ -6521,7 +6509,7 @@ static struct { | |||
6521 | unsigned long next_balance; /* in jiffy units */ | 6509 | unsigned long next_balance; /* in jiffy units */ |
6522 | } nohz ____cacheline_aligned; | 6510 | } nohz ____cacheline_aligned; |
6523 | 6511 | ||
6524 | static inline int find_new_ilb(int call_cpu) | 6512 | static inline int find_new_ilb(void) |
6525 | { | 6513 | { |
6526 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 6514 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
6527 | 6515 | ||
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu) | |||
6536 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | 6524 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle |
6537 | * CPU (if there is one). | 6525 | * CPU (if there is one). |
6538 | */ | 6526 | */ |
6539 | static void nohz_balancer_kick(int cpu) | 6527 | static void nohz_balancer_kick(void) |
6540 | { | 6528 | { |
6541 | int ilb_cpu; | 6529 | int ilb_cpu; |
6542 | 6530 | ||
6543 | nohz.next_balance++; | 6531 | nohz.next_balance++; |
6544 | 6532 | ||
6545 | ilb_cpu = find_new_ilb(cpu); | 6533 | ilb_cpu = find_new_ilb(); |
6546 | 6534 | ||
6547 | if (ilb_cpu >= nr_cpu_ids) | 6535 | if (ilb_cpu >= nr_cpu_ids) |
6548 | return; | 6536 | return; |
@@ -6652,10 +6640,10 @@ void update_max_interval(void) | |||
6652 | * | 6640 | * |
6653 | * Balancing parameters are set up in init_sched_domains. | 6641 | * Balancing parameters are set up in init_sched_domains. |
6654 | */ | 6642 | */ |
6655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 6643 | static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) |
6656 | { | 6644 | { |
6657 | int continue_balancing = 1; | 6645 | int continue_balancing = 1; |
6658 | struct rq *rq = cpu_rq(cpu); | 6646 | int cpu = rq->cpu; |
6659 | unsigned long interval; | 6647 | unsigned long interval; |
6660 | struct sched_domain *sd; | 6648 | struct sched_domain *sd; |
6661 | /* Earliest time when we have to do rebalance again */ | 6649 | /* Earliest time when we have to do rebalance again */ |
@@ -6752,9 +6740,9 @@ out: | |||
6752 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 6740 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
6753 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 6741 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
6754 | */ | 6742 | */ |
6755 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 6743 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) |
6756 | { | 6744 | { |
6757 | struct rq *this_rq = cpu_rq(this_cpu); | 6745 | int this_cpu = this_rq->cpu; |
6758 | struct rq *rq; | 6746 | struct rq *rq; |
6759 | int balance_cpu; | 6747 | int balance_cpu; |
6760 | 6748 | ||
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
6781 | update_idle_cpu_load(rq); | 6769 | update_idle_cpu_load(rq); |
6782 | raw_spin_unlock_irq(&rq->lock); | 6770 | raw_spin_unlock_irq(&rq->lock); |
6783 | 6771 | ||
6784 | rebalance_domains(balance_cpu, CPU_IDLE); | 6772 | rebalance_domains(rq, CPU_IDLE); |
6785 | 6773 | ||
6786 | if (time_after(this_rq->next_balance, rq->next_balance)) | 6774 | if (time_after(this_rq->next_balance, rq->next_balance)) |
6787 | this_rq->next_balance = rq->next_balance; | 6775 | this_rq->next_balance = rq->next_balance; |
@@ -6800,14 +6788,14 @@ end: | |||
6800 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 6788 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
6801 | * domain span are idle. | 6789 | * domain span are idle. |
6802 | */ | 6790 | */ |
6803 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 6791 | static inline int nohz_kick_needed(struct rq *rq) |
6804 | { | 6792 | { |
6805 | unsigned long now = jiffies; | 6793 | unsigned long now = jiffies; |
6806 | struct sched_domain *sd; | 6794 | struct sched_domain *sd; |
6807 | struct sched_group_power *sgp; | 6795 | struct sched_group_power *sgp; |
6808 | int nr_busy; | 6796 | int nr_busy, cpu = rq->cpu; |
6809 | 6797 | ||
6810 | if (unlikely(idle_cpu(cpu))) | 6798 | if (unlikely(rq->idle_balance)) |
6811 | return 0; | 6799 | return 0; |
6812 | 6800 | ||
6813 | /* | 6801 | /* |
@@ -6856,7 +6844,7 @@ need_kick: | |||
6856 | return 1; | 6844 | return 1; |
6857 | } | 6845 | } |
6858 | #else | 6846 | #else |
6859 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 6847 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
6860 | #endif | 6848 | #endif |
6861 | 6849 | ||
6862 | /* | 6850 | /* |
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | |||
6865 | */ | 6853 | */ |
6866 | static void run_rebalance_domains(struct softirq_action *h) | 6854 | static void run_rebalance_domains(struct softirq_action *h) |
6867 | { | 6855 | { |
6868 | int this_cpu = smp_processor_id(); | 6856 | struct rq *this_rq = this_rq(); |
6869 | struct rq *this_rq = cpu_rq(this_cpu); | ||
6870 | enum cpu_idle_type idle = this_rq->idle_balance ? | 6857 | enum cpu_idle_type idle = this_rq->idle_balance ? |
6871 | CPU_IDLE : CPU_NOT_IDLE; | 6858 | CPU_IDLE : CPU_NOT_IDLE; |
6872 | 6859 | ||
6873 | rebalance_domains(this_cpu, idle); | 6860 | rebalance_domains(this_rq, idle); |
6874 | 6861 | ||
6875 | /* | 6862 | /* |
6876 | * If this cpu has a pending nohz_balance_kick, then do the | 6863 | * If this cpu has a pending nohz_balance_kick, then do the |
6877 | * balancing on behalf of the other idle cpus whose ticks are | 6864 | * balancing on behalf of the other idle cpus whose ticks are |
6878 | * stopped. | 6865 | * stopped. |
6879 | */ | 6866 | */ |
6880 | nohz_idle_balance(this_cpu, idle); | 6867 | nohz_idle_balance(this_rq, idle); |
6881 | } | 6868 | } |
6882 | 6869 | ||
6883 | static inline int on_null_domain(int cpu) | 6870 | static inline int on_null_domain(struct rq *rq) |
6884 | { | 6871 | { |
6885 | return !rcu_dereference_sched(cpu_rq(cpu)->sd); | 6872 | return !rcu_dereference_sched(rq->sd); |
6886 | } | 6873 | } |
6887 | 6874 | ||
6888 | /* | 6875 | /* |
6889 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 6876 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
6890 | */ | 6877 | */ |
6891 | void trigger_load_balance(struct rq *rq, int cpu) | 6878 | void trigger_load_balance(struct rq *rq) |
6892 | { | 6879 | { |
6893 | /* Don't need to rebalance while attached to NULL domain */ | 6880 | /* Don't need to rebalance while attached to NULL domain */ |
6894 | if (time_after_eq(jiffies, rq->next_balance) && | 6881 | if (unlikely(on_null_domain(rq))) |
6895 | likely(!on_null_domain(cpu))) | 6882 | return; |
6883 | |||
6884 | if (time_after_eq(jiffies, rq->next_balance)) | ||
6896 | raise_softirq(SCHED_SOFTIRQ); | 6885 | raise_softirq(SCHED_SOFTIRQ); |
6897 | #ifdef CONFIG_NO_HZ_COMMON | 6886 | #ifdef CONFIG_NO_HZ_COMMON |
6898 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 6887 | if (nohz_kick_needed(rq)) |
6899 | nohz_balancer_kick(cpu); | 6888 | nohz_balancer_kick(); |
6900 | #endif | 6889 | #endif |
6901 | } | 6890 | } |
6902 | 6891 | ||
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
7012 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 7001 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
7013 | 7002 | ||
7014 | /* | 7003 | /* |
7015 | * Ensure the task's vruntime is normalized, so that when its | 7004 | * Ensure the task's vruntime is normalized, so that when it's |
7016 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7005 | * switched back to the fair class the enqueue_entity(.flags=0) will |
7017 | * do the right thing. | 7006 | * do the right thing. |
7018 | * | 7007 | * |
7019 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | 7008 | * If it's on_rq, then the dequeue_entity(.flags=0) will already |
7020 | * have normalized the vruntime, if it was !on_rq, then only when | 7009 | * have normalized the vruntime, if it's !on_rq, then only when |
7021 | * the task is sleeping will it still have non-normalized vruntime. | 7010 | * the task is sleeping will it still have non-normalized vruntime. |
7022 | */ | 7011 | */ |
7023 | if (!se->on_rq && p->state != TASK_RUNNING) { | 7012 | if (!p->on_rq && p->state != TASK_RUNNING) { |
7024 | /* | 7013 | /* |
7025 | * Fix up our vruntime so that the current sleep doesn't | 7014 | * Fix up our vruntime so that the current sleep doesn't |
7026 | * cause 'unlimited' sleep bonus. | 7015 | * cause 'unlimited' sleep bonus. |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..1999021042c7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
538 | 538 | ||
539 | #endif /* CONFIG_RT_GROUP_SCHED */ | 539 | #endif /* CONFIG_RT_GROUP_SCHED */ |
540 | 540 | ||
541 | bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) | ||
542 | { | ||
543 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
544 | |||
545 | return (hrtimer_active(&rt_b->rt_period_timer) || | ||
546 | rt_rq->rt_time < rt_b->rt_runtime); | ||
547 | } | ||
548 | |||
541 | #ifdef CONFIG_SMP | 549 | #ifdef CONFIG_SMP |
542 | /* | 550 | /* |
543 | * We ran out of runtime, see if we can borrow some from our neighbours. | 551 | * We ran out of runtime, see if we can borrow some from our neighbours. |
@@ -1738,7 +1746,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1738 | !test_tsk_need_resched(rq->curr) && | 1746 | !test_tsk_need_resched(rq->curr) && |
1739 | has_pushable_tasks(rq) && | 1747 | has_pushable_tasks(rq) && |
1740 | p->nr_cpus_allowed > 1 && | 1748 | p->nr_cpus_allowed > 1 && |
1741 | rt_task(rq->curr) && | 1749 | (dl_task(rq->curr) || rt_task(rq->curr)) && |
1742 | (rq->curr->nr_cpus_allowed < 2 || | 1750 | (rq->curr->nr_cpus_allowed < 2 || |
1743 | rq->curr->prio <= p->prio)) | 1751 | rq->curr->prio <= p->prio)) |
1744 | push_rt_tasks(rq); | 1752 | push_rt_tasks(rq); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..f964add50f38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/sched/deadline.h> | ||
5 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
6 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
@@ -9,6 +10,7 @@ | |||
9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
10 | 11 | ||
11 | #include "cpupri.h" | 12 | #include "cpupri.h" |
13 | #include "cpudeadline.h" | ||
12 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
13 | 15 | ||
14 | struct rq; | 16 | struct rq; |
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
73 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 75 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
74 | 76 | ||
75 | /* | 77 | /* |
78 | * Single value that decides SCHED_DEADLINE internal math precision. | ||
79 | * 10 -> just above 1us | ||
80 | * 9 -> just above 0.5us | ||
81 | */ | ||
82 | #define DL_SCALE (10) | ||
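For reference, the two values in the comment above are just 1 << DL_SCALE expressed in nanoseconds: 1 << 10 = 1024ns (just above 1us) and 1 << 9 = 512ns (just above 0.5us). How core.c actually applies the shift is outside this hunk, so reading DL_SCALE as the precision/minimum granularity of the -deadline parameters is an assumption here.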
83 | |||
84 | /* | ||
76 | * These are the 'tuning knobs' of the scheduler: | 85 | * These are the 'tuning knobs' of the scheduler: |
77 | */ | 86 | */ |
78 | 87 | ||
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
81 | */ | 90 | */ |
82 | #define RUNTIME_INF ((u64)~0ULL) | 91 | #define RUNTIME_INF ((u64)~0ULL) |
83 | 92 | ||
93 | static inline int fair_policy(int policy) | ||
94 | { | ||
95 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | ||
96 | } | ||
97 | |||
84 | static inline int rt_policy(int policy) | 98 | static inline int rt_policy(int policy) |
85 | { | 99 | { |
86 | if (policy == SCHED_FIFO || policy == SCHED_RR) | 100 | return policy == SCHED_FIFO || policy == SCHED_RR; |
87 | return 1; | 101 | } |
88 | return 0; | 102 | |
103 | static inline int dl_policy(int policy) | ||
104 | { | ||
105 | return policy == SCHED_DEADLINE; | ||
89 | } | 106 | } |
90 | 107 | ||
91 | static inline int task_has_rt_policy(struct task_struct *p) | 108 | static inline int task_has_rt_policy(struct task_struct *p) |
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p) | |||
93 | return rt_policy(p->policy); | 110 | return rt_policy(p->policy); |
94 | } | 111 | } |
95 | 112 | ||
113 | static inline int task_has_dl_policy(struct task_struct *p) | ||
114 | { | ||
115 | return dl_policy(p->policy); | ||
116 | } | ||
117 | |||
118 | static inline bool dl_time_before(u64 a, u64 b) | ||
119 | { | ||
120 | return (s64)(a - b) < 0; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Tells if entity @a should preempt entity @b. | ||
125 | */ | ||
126 | static inline bool | ||
127 | dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) | ||
128 | { | ||
129 | return dl_time_before(a->deadline, b->deadline); | ||
130 | } | ||
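The signed subtraction in dl_time_before() keeps the ordering correct across u64 clock wraparound. A self-contained illustration (plain user-space C, hypothetical values):

#include <assert.h>
#include <stdint.h>

static int ex_dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t b = UINT64_MAX - 10;   /* deadline set just before wrap */
        uint64_t a = 5;                 /* deadline set just after wrap  */

        /* ordinary case: 100 comes before 200 */
        assert(ex_dl_time_before(100, 200));

        /*
         * Wraparound case: a plain "a < b" would claim a is earlier;
         * the signed difference correctly says b is the earlier one.
         */
        assert(!ex_dl_time_before(a, b));
        assert(ex_dl_time_before(b, a));

        return 0;
}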
131 | |||
96 | /* | 132 | /* |
97 | * This is the priority-queue data structure of the RT scheduling class: | 133 | * This is the priority-queue data structure of the RT scheduling class: |
98 | */ | 134 | */ |
@@ -108,6 +144,47 @@ struct rt_bandwidth { | |||
108 | u64 rt_runtime; | 144 | u64 rt_runtime; |
109 | struct hrtimer rt_period_timer; | 145 | struct hrtimer rt_period_timer; |
110 | }; | 146 | }; |
147 | /* | ||
148 | * To keep the bandwidth of -deadline tasks and groups under control | ||
149 | * we need some place where we can: | ||
150 | * - store the maximum -deadline bandwidth of the system (the group); | ||
151 | * - cache the fraction of that bandwidth that is currently allocated. | ||
152 | * | ||
153 | * This is all done in the data structure below. It is similar to the | ||
154 | * one used for RT-throttling (rt_bandwidth), with the main difference | ||
155 | * that, since here we are only interested in admission control, we | ||
156 | * do not decrease any runtime while the group "executes", nor do we | ||
157 | * need a timer to replenish it. | ||
158 | * | ||
159 | * With respect to SMP, the bandwidth is given on a per-CPU basis, | ||
160 | * meaning that: | ||
161 | * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; | ||
162 | * - dl_total_bw array contains, in the i-eth element, the currently | ||
163 | * allocated bandwidth on the i-eth CPU. | ||
164 | * Moreover, groups consume bandwidth on each CPU, while tasks only | ||
165 | * consume bandwidth on the CPU they're running on. | ||
166 | * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw | ||
167 | * that will be shown the next time the proc or cgroup controls will | ||
168 | * be red. It on its turn can be changed by writing on its own | ||
169 | * control. | ||
170 | */ | ||
171 | struct dl_bandwidth { | ||
172 | raw_spinlock_t dl_runtime_lock; | ||
173 | u64 dl_runtime; | ||
174 | u64 dl_period; | ||
175 | }; | ||
176 | |||
177 | static inline int dl_bandwidth_enabled(void) | ||
178 | { | ||
179 | return sysctl_sched_rt_runtime >= 0; | ||
180 | } | ||
181 | |||
182 | extern struct dl_bw *dl_bw_of(int i); | ||
183 | |||
184 | struct dl_bw { | ||
185 | raw_spinlock_t lock; | ||
186 | u64 bw, total_bw; | ||
187 | }; | ||
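A sketch of how dl_bw is meant to be consumed for admission control; the real test lives in core.c, outside this hunk, so the EX_BW_SHIFT fixed-point shift and the ex_* helpers below are illustrative assumptions rather than the kernel's actual code:

#define EX_BW_SHIFT     20      /* fixed point: 1.0 == 1 << EX_BW_SHIFT */

/* bandwidth of one -deadline task: runtime/period as a fixed-point ratio */
static u64 ex_to_ratio(u64 period, u64 runtime)
{
        return div64_u64(runtime << EX_BW_SHIFT, period);
}

/* would admitting a task with (period, runtime) overcommit this dl_bw? */
static bool ex_dl_overflow(struct dl_bw *dl_b, u64 period, u64 runtime)
{
        u64 new_bw = ex_to_ratio(period, runtime);

        /* bw == -1 is taken to mean "admission control disabled" */
        return dl_b->bw != (u64)-1 && dl_b->total_bw + new_bw > dl_b->bw;
}

On success the new bandwidth is added to total_bw; task_dead_dl() in deadline.c above shows the matching release when a -deadline task exits.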
111 | 188 | ||
112 | extern struct mutex sched_domains_mutex; | 189 | extern struct mutex sched_domains_mutex; |
113 | 190 | ||
@@ -364,6 +441,41 @@ struct rt_rq { | |||
364 | #endif | 441 | #endif |
365 | }; | 442 | }; |
366 | 443 | ||
444 | /* Deadline class' related fields in a runqueue */ | ||
445 | struct dl_rq { | ||
446 | /* runqueue is an rbtree, ordered by deadline */ | ||
447 | struct rb_root rb_root; | ||
448 | struct rb_node *rb_leftmost; | ||
449 | |||
450 | unsigned long dl_nr_running; | ||
451 | |||
452 | #ifdef CONFIG_SMP | ||
453 | /* | ||
454 | * Deadline values of the currently executing and the | ||
455 | * earliest ready task on this rq. Caching these facilitates | ||
456 | * the decision whether or not a ready but not running task | ||
457 | * should migrate somewhere else. | ||
458 | */ | ||
459 | struct { | ||
460 | u64 curr; | ||
461 | u64 next; | ||
462 | } earliest_dl; | ||
463 | |||
464 | unsigned long dl_nr_migratory; | ||
465 | int overloaded; | ||
466 | |||
467 | /* | ||
468 | * Tasks on this rq that can be pushed away. They are kept in | ||
469 | * an rb-tree, ordered by tasks' deadlines, with caching | ||
470 | * of the leftmost (earliest deadline) element. | ||
471 | */ | ||
472 | struct rb_root pushable_dl_tasks_root; | ||
473 | struct rb_node *pushable_dl_tasks_leftmost; | ||
474 | #else | ||
475 | struct dl_bw dl_bw; | ||
476 | #endif | ||
477 | }; | ||
478 | |||
367 | #ifdef CONFIG_SMP | 479 | #ifdef CONFIG_SMP |
368 | 480 | ||
369 | /* | 481 | /* |
@@ -382,6 +494,15 @@ struct root_domain { | |||
382 | cpumask_var_t online; | 494 | cpumask_var_t online; |
383 | 495 | ||
384 | /* | 496 | /* |
497 | * The bit corresponding to a CPU gets set here if such CPU has more | ||
498 | * than one runnable -deadline task (as it is below for RT tasks). | ||
499 | */ | ||
500 | cpumask_var_t dlo_mask; | ||
501 | atomic_t dlo_count; | ||
502 | struct dl_bw dl_bw; | ||
503 | struct cpudl cpudl; | ||
504 | |||
505 | /* | ||
385 | * The "RT overload" flag: it gets set if a CPU has more than | 506 | * The "RT overload" flag: it gets set if a CPU has more than |
386 | * one runnable RT task. | 507 | * one runnable RT task. |
387 | */ | 508 | */ |
@@ -432,6 +553,7 @@ struct rq { | |||
432 | 553 | ||
433 | struct cfs_rq cfs; | 554 | struct cfs_rq cfs; |
434 | struct rt_rq rt; | 555 | struct rt_rq rt; |
556 | struct dl_rq dl; | ||
435 | 557 | ||
436 | #ifdef CONFIG_FAIR_GROUP_SCHED | 558 | #ifdef CONFIG_FAIR_GROUP_SCHED |
437 | /* list of leaf cfs_rq on this cpu: */ | 559 | /* list of leaf cfs_rq on this cpu: */ |
@@ -827,8 +949,6 @@ static inline u64 global_rt_runtime(void) | |||
827 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 949 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
828 | } | 950 | } |
829 | 951 | ||
830 | |||
831 | |||
832 | static inline int task_current(struct rq *rq, struct task_struct *p) | 952 | static inline int task_current(struct rq *rq, struct task_struct *p) |
833 | { | 953 | { |
834 | return rq->curr == p; | 954 | return rq->curr == p; |
@@ -988,6 +1108,7 @@ static const u32 prio_to_wmult[40] = { | |||
988 | #else | 1108 | #else |
989 | #define ENQUEUE_WAKING 0 | 1109 | #define ENQUEUE_WAKING 0 |
990 | #endif | 1110 | #endif |
1111 | #define ENQUEUE_REPLENISH 8 | ||
991 | 1112 | ||
992 | #define DEQUEUE_SLEEP 1 | 1113 | #define DEQUEUE_SLEEP 1 |
993 | 1114 | ||
@@ -1023,6 +1144,7 @@ struct sched_class { | |||
1023 | void (*set_curr_task) (struct rq *rq); | 1144 | void (*set_curr_task) (struct rq *rq); |
1024 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1145 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
1025 | void (*task_fork) (struct task_struct *p); | 1146 | void (*task_fork) (struct task_struct *p); |
1147 | void (*task_dead) (struct task_struct *p); | ||
1026 | 1148 | ||
1027 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1149 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
1028 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1150 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
@@ -1042,6 +1164,7 @@ struct sched_class { | |||
1042 | for (class = sched_class_highest; class; class = class->next) | 1164 | for (class = sched_class_highest; class; class = class->next) |
1043 | 1165 | ||
1044 | extern const struct sched_class stop_sched_class; | 1166 | extern const struct sched_class stop_sched_class; |
1167 | extern const struct sched_class dl_sched_class; | ||
1045 | extern const struct sched_class rt_sched_class; | 1168 | extern const struct sched_class rt_sched_class; |
1046 | extern const struct sched_class fair_sched_class; | 1169 | extern const struct sched_class fair_sched_class; |
1047 | extern const struct sched_class idle_sched_class; | 1170 | extern const struct sched_class idle_sched_class; |
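With dl_sched_class chained in between stop_sched_class and rt_sched_class (see the stop_task.c hunk at the end of this patch), the for_each_class() walk above visits the classes in strict priority order: stop, deadline, rt, fair, idle. Schematically, the core pick path relies on exactly that ordering (the exact pick_next_task() signature is elided here):

	/* Schematic only: ask each class, highest priority first. */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}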
@@ -1051,7 +1174,7 @@ extern const struct sched_class idle_sched_class; | |||
1051 | 1174 | ||
1052 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1175 | extern void update_group_power(struct sched_domain *sd, int cpu); |
1053 | 1176 | ||
1054 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1177 | extern void trigger_load_balance(struct rq *rq); |
1055 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1178 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1056 | 1179 | ||
1057 | extern void idle_enter_fair(struct rq *this_rq); | 1180 | extern void idle_enter_fair(struct rq *this_rq); |
@@ -1068,8 +1191,11 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
1068 | extern void sysrq_sched_debug_show(void); | 1191 | extern void sysrq_sched_debug_show(void); |
1069 | extern void sched_init_granularity(void); | 1192 | extern void sched_init_granularity(void); |
1070 | extern void update_max_interval(void); | 1193 | extern void update_max_interval(void); |
1194 | |||
1195 | extern void init_sched_dl_class(void); | ||
1071 | extern void init_sched_rt_class(void); | 1196 | extern void init_sched_rt_class(void); |
1072 | extern void init_sched_fair_class(void); | 1197 | extern void init_sched_fair_class(void); |
1073 | 1199 | ||
1074 | extern void resched_task(struct task_struct *p); | 1200 | extern void resched_task(struct task_struct *p); |
1075 | extern void resched_cpu(int cpu); | 1201 | extern void resched_cpu(int cpu); |
@@ -1077,6 +1203,12 @@ extern void resched_cpu(int cpu); | |||
1077 | extern struct rt_bandwidth def_rt_bandwidth; | 1203 | extern struct rt_bandwidth def_rt_bandwidth; |
1078 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 1204 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
1079 | 1205 | ||
1206 | extern struct dl_bandwidth def_dl_bandwidth; | ||
1207 | extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); | ||
1208 | extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | ||
1209 | |||
1210 | unsigned long to_ratio(u64 period, u64 runtime); | ||
1211 | |||
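to_ratio() is the pre-existing helper in kernel/sched/core.c that converts a (period, runtime) pair into the 20-bit fixed-point utilization used by the dl_bw accounting; simplified, it does roughly the following:

unsigned long to_ratio(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 20;		/* full utilization */

	if (period == 0)			/* avoid dividing by zero */
		return 0;

	return div64_u64(runtime << 20, period);
}

/*
 * Example, values in nanoseconds: runtime = 10ms, period = 100ms
 *   (10000000 << 20) / 100000000 = 104857  ~=  0.1 * 2^20
 */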
1080 | extern void update_idle_cpu_load(struct rq *this_rq); | 1212 | extern void update_idle_cpu_load(struct rq *this_rq); |
1081 | 1213 | ||
1082 | extern void init_task_runnable_average(struct task_struct *p); | 1214 | extern void init_task_runnable_average(struct task_struct *p); |
@@ -1353,6 +1485,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1353 | 1485 | ||
1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1486 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1487 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
1488 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | ||
1356 | 1489 | ||
1357 | extern void cfs_bandwidth_usage_inc(void); | 1490 | extern void cfs_bandwidth_usage_inc(void); |
1358 | extern void cfs_bandwidth_usage_dec(void); | 1491 | extern void cfs_bandwidth_usage_dec(void); |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
103 | * Simple, special scheduling class for the per-CPU stop tasks: | 103 | * Simple, special scheduling class for the per-CPU stop tasks: |
104 | */ | 104 | */ |
105 | const struct sched_class stop_sched_class = { | 105 | const struct sched_class stop_sched_class = { |
106 | .next = &rt_sched_class, | 106 | .next = &dl_sched_class, |
107 | 107 | ||
108 | .enqueue_task = enqueue_task_stop, | 108 | .enqueue_task = enqueue_task_stop, |
109 | .dequeue_task = dequeue_task_stop, | 109 | .dequeue_task = dequeue_task_stop, |