author	Linus Torvalds <torvalds@linux-foundation.org>	2014-01-20 13:42:08 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-01-20 13:42:08 -0500
commit	a0fa1dd3cdbccec9597fe53b6177a9aa6e20f2f8 (patch)
tree	b249854573815eedf377e554f0ea516f86411841
parent	9326657abe1a83ed4b4f396b923ca1217fd50cba (diff)
parent	eaad45132c564ce377e6dce05e78e08e456d5315 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:

 - Add the initial implementation of SCHED_DEADLINE support: a
   real-time scheduling policy where tasks that meet their deadlines
   and periodically execute their instances in less than their runtime
   quota see real-time scheduling and won't miss any of their
   deadlines.  Tasks that go over their quota get delayed (Available
   to privileged users for now)

 - Clean up and fix preempt_enable_no_resched() abuse all around the
   tree

 - Do sched_clock() performance optimizations on x86 and elsewhere

 - Fix and improve auto-NUMA balancing

 - Fix and clean up the idle loop

 - Apply various cleanups and fixes

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  sched: Fix __sched_setscheduler() nice test
  sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
  sched: Fix up attr::sched_priority warning
  sched: Fix up scheduler syscall LTP fails
  sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
  sched/core: Fix htmldocs warnings
  sched/deadline: No need to check p if dl_se is valid
  sched/deadline: Remove unused variables
  sched/deadline: Fix sparse static warnings
  m68k: Fix build warning in mac_via.h
  sched, thermal: Clean up preempt_enable_no_resched() abuse
  sched, net: Fixup busy_loop_us_clock()
  sched, net: Clean up preempt_enable_no_resched() abuse
  sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
  sched/preempt, locking: Rework local_bh_{dis,en}able()
  sched/clock, x86: Avoid a runtime condition in native_sched_clock()
  sched/clock: Fix up clear_sched_clock_stable()
  sched/clock, x86: Use a static_key for sched_clock_stable
  sched/clock: Remove local_irq_disable() from the clocks
  sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
  ...
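[Editor's note: the new interface is driven from userspace through the sched_setattr()/sched_getattr() syscalls wired up later in this diff (entries 314/315 on x86-64, 351/352 on i386, 380/381 on ARM). The sketch below is a hypothetical, illustrative caller using the struct sched_attr layout added to include/linux/sched.h further down; the SCHED_DEADLINE policy value, the fallback syscall number and the exact argument list are assumptions made for illustration, since libc provides no wrapper at this point.]

```c
/*
 * Hypothetical userspace sketch: request 10 ms of runtime every 100 ms
 * period under SCHED_DEADLINE.  struct sched_attr mirrors the definition
 * added to include/linux/sched.h in this series; SCHED_DEADLINE (6) and
 * the x86-64 syscall number (314, from the table in this diff) are
 * assumptions here, not part of the patch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SYS_sched_setattr
#define SYS_sched_setattr 314	/* x86-64 entry added by this series */
#endif
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6	/* assumed policy value */
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);	/* 48 bytes == SCHED_ATTR_SIZE_VER0 */
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline = 100 * 1000 * 1000;	/* 100 ms */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	/* Needs privilege; "available to privileged users for now". */
	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	/* From here on, each 100 ms period grants up to 10 ms of runtime. */
	return 0;
}
```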
-rw-r--r--  Documentation/sysctl/kernel.txt | 5
-rw-r--r--  arch/arm/include/asm/unistd.h | 2
-rw-r--r--  arch/arm/include/uapi/asm/unistd.h | 2
-rw-r--r--  arch/arm/kernel/calls.S | 2
-rw-r--r--  arch/m68k/include/asm/mac_via.h | 2
-rw-r--r--  arch/x86/include/asm/mwait.h | 43
-rw-r--r--  arch/x86/include/asm/processor.h | 23
-rw-r--r--  arch/x86/include/asm/timer.h | 77
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 23
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 16
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/tsc.c | 318
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 66
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl | 2
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl | 2
-rw-r--r--  drivers/acpi/acpi_pad.c | 5
-rw-r--r--  drivers/acpi/processor_idle.c | 15
-rw-r--r--  drivers/idle/intel_idle.c | 11
-rw-r--r--  drivers/thermal/intel_powerclamp.c | 6
-rw-r--r--  include/linux/bottom_half.h | 32
-rw-r--r--  include/linux/hardirq.h | 1
-rw-r--r--  include/linux/init_task.h | 10
-rw-r--r--  include/linux/preempt.h | 37
-rw-r--r--  include/linux/preempt_mask.h | 16
-rw-r--r--  include/linux/rtmutex.h | 18
-rw-r--r--  include/linux/rwlock_api_smp.h | 12
-rw-r--r--  include/linux/sched.h | 141
-rw-r--r--  include/linux/sched/deadline.h | 24
-rw-r--r--  include/linux/sched/rt.h | 5
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/spinlock_api_smp.h | 12
-rw-r--r--  include/linux/spinlock_api_up.h | 16
-rw-r--r--  include/linux/syscalls.h | 6
-rw-r--r--  include/linux/uaccess.h | 5
-rw-r--r--  include/net/busy_poll.h | 19
-rw-r--r--  include/uapi/linux/sched.h | 6
-rw-r--r--  kernel/cpu/idle.c | 17
-rw-r--r--  kernel/fork.c | 12
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/hrtimer.c | 3
-rw-r--r--  kernel/locking/rtmutex-debug.c | 8
-rw-r--r--  kernel/locking/rtmutex.c | 166
-rw-r--r--  kernel/locking/rtmutex_common.h | 23
-rw-r--r--  kernel/sched/Makefile | 5
-rw-r--r--  kernel/sched/clock.c | 78
-rw-r--r--  kernel/sched/core.c | 822
-rw-r--r--  kernel/sched/cpudeadline.c | 216
-rw-r--r--  kernel/sched/cpudeadline.h | 33
-rw-r--r--  kernel/sched/deadline.c | 1640
-rw-r--r--  kernel/sched/debug.c | 4
-rw-r--r--  kernel/sched/fair.c | 83
-rw-r--r--  kernel/sched/rt.c | 2
-rw-r--r--  kernel/sched/sched.h | 146
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/softirq.c | 39
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 65
-rw-r--r--  kernel/trace/trace_selftest.c | 33
-rw-r--r--  net/ipv4/tcp.c | 4
63 files changed, 3775 insertions, 626 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 26b7ee491df8..6d486404200e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -428,11 +428,6 @@ rate for each task.
 numa_balancing_scan_size_mb is how many megabytes worth of pages are
 scanned for a given scan.
 
-numa_balancing_settle_count is how many scan periods must complete before
-the schedule balancer stops pushing the task towards a preferred node. This
-gives the scheduler a chance to place the task on an alternative node if the
-preferred node is overloaded.
-
 numa_balancing_migrate_deferred is how many page migrations get skipped
 unconditionally, after a page migration is skipped because a page is shared
 with other tasks. This reduces page migration overhead, and determines
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 141baa3f9a72..acabef1a75df 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@
 
 #include <uapi/asm/unistd.h>
 
-#define __NR_syscalls (380)
+#define __NR_syscalls (384)
 #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)
 
 #define __ARCH_WANT_STAT64
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index af33b44990ed..fb5584d0cc05 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,8 @@
 #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
 #define __NR_kcmp (__NR_SYSCALL_BASE+378)
 #define __NR_finit_module (__NR_SYSCALL_BASE+379)
+#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
+#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)
 
 /*
  * This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index c6ca7e376773..166e945de832 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,8 @@
 		CALL(sys_process_vm_writev)
 		CALL(sys_kcmp)
 		CALL(sys_finit_module)
+/* 380 */	CALL(sys_sched_setattr)
+		CALL(sys_sched_getattr)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/m68k/include/asm/mac_via.h b/arch/m68k/include/asm/mac_via.h
index aeeedf8b2d25..fe3fc9ae1b69 100644
--- a/arch/m68k/include/asm/mac_via.h
+++ b/arch/m68k/include/asm/mac_via.h
@@ -254,6 +254,8 @@
 extern volatile __u8 *via1,*via2;
 extern int rbv_present,via_alt_mapping;
 
+struct irq_desc;
+
 extern void via_register_interrupts(void);
 extern void via_irq_enable(int);
 extern void via_irq_disable(int);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0ac6b4..1da25a5f96f9 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_MWAIT_H
 #define _ASM_X86_MWAIT_H
 
+#include <linux/sched.h>
+
 #define MWAIT_SUBSTATE_MASK 0xf
 #define MWAIT_CSTATE_MASK 0xf
 #define MWAIT_SUBSTATE_SIZE 4
@@ -13,4 +15,45 @@
 
 #define MWAIT_ECX_INTERRUPT_BREAK 0x1
 
+static inline void __monitor(const void *eax, unsigned long ecx,
+			     unsigned long edx)
+{
+	/* "monitor %eax, %ecx, %edx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc8;"
+		     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+	if (!current_set_polling_and_test()) {
+		if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+			mb();
+			clflush((void *)&current_thread_info()->flags);
+			mb();
+		}
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		if (!need_resched())
+			__mwait(eax, ecx);
+	}
+	current_clr_polling();
+}
+
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4057f9..24821f5768bc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -700,29 +700,6 @@ static inline void sync_core(void)
 #endif
 }
 
-static inline void __monitor(const void *eax, unsigned long ecx,
-			     unsigned long edx)
-{
-	/* "monitor %eax, %ecx, %edx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc8;"
-		     :: "a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax, %ecx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
-
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
-	trace_hardirqs_on();
-	/* "mwait %eax, %ecx;" */
-	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
 
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0eb5d0c..3de54ef0aea5 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -4,6 +4,7 @@
 #include <linux/pm.h>
 #include <linux/percpu.h>
 #include <linux/interrupt.h>
+#include <linux/math64.h>
 
 #define TICK_SIZE (tick_nsec / 1000)
 
@@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void)
 
 extern int no_timer_check;
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
+/*
+ * We use the full linear equation: f(x) = a + b*x, in order to allow
+ * a continuous function in the face of dynamic freq changes.
  *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
+ * Continuity means that when our frequency changes our slope (b); we want to
+ * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
  *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
+ * Without an offset (a) the above would not be possible.
  *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- *
- * In:
- *
- * ns = cycles * cyc2ns_scale / SC
- *
- * Although we may still have enough bits to store the value of ns,
- * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
- * leading to an incorrect result.
- *
- * To avoid this, we can decompose 'cycles' into quotient and remainder
- * of division by SC. Then,
- *
- * ns = (quot * SC + rem) * cyc2ns_scale / SC
- *    = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
- *
- *			- sqazi@google.com
+ * See the comment near cycles_2_ns() for details on how we compute (b).
  */
-
-DECLARE_PER_CPU(unsigned long, cyc2ns);
-DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
-{
-	int cpu = smp_processor_id();
-	unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
-	ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
-			(1UL << CYC2NS_SCALE_FACTOR));
-	return ns;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	unsigned long long ns;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	ns = __cycles_2_ns(cyc);
-	local_irq_restore(flags);
-
-	return ns;
-}
+struct cyc2ns_data {
+	u32 cyc2ns_mul;
+	u32 cyc2ns_shift;
+	u64 cyc2ns_offset;
+	u32 __count;
+	/* u32 hole */
+}; /* 24 bytes -- do not grow */
+
+extern struct cyc2ns_data *cyc2ns_read_begin(void);
+extern void cyc2ns_read_end(struct cyc2ns_data *);
 
 #endif /* _ASM_X86_TIMER_H */
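[Editor's note: the read-side pairing of cyc2ns_read_begin()/cyc2ns_read_end() around the {mul, shift, offset} triplet is easiest to see from a consumer's point of view. The sketch below is illustrative kernel-context pseudocode (it assumes <asm/timer.h> and <linux/math64.h>); my_cycles_2_ns is a stand-in name, and it shadows what the tsc.c, perf_event.c and tlb_uv.c hunks further down actually do.]

```c
/*
 * Sketch of a cyc2ns consumer: convert a TSC value to nanoseconds using the
 * per-cpu {mul, shift, offset} data published through cyc2ns_read_begin().
 * Illustrative only; mirrors the readers added later in this diff.
 */
static u64 my_cycles_2_ns(u64 cyc)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();	/* disables preemption */
	u64 ns;

	/* ns = offset + (cyc * mul) >> shift, i.e. the linear f(x) = a + b*x */
	ns = data->cyc2ns_offset + mul_u64_u32_shr(cyc, data->cyc2ns_mul,
						   data->cyc2ns_shift);

	cyc2ns_read_end(data);	/* may hand the entry back to a pending writer */
	return ns;
}
```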
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..e69182fd01cf 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..8bc79cddd9a2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b342c026..1a439c047ff3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..b88645191fe5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = {
 
 void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
+	struct cyc2ns_data *data;
+
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
 	userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
 	userpg->pmc_width = x86_pmu.cntval_bits;
 
-	if (!sched_clock_stable)
+	if (!sched_clock_stable())
 		return;
 
+	data = cyc2ns_read_begin();
+
 	userpg->cap_user_time = 1;
-	userpg->time_mult = this_cpu_read(cyc2ns);
-	userpg->time_shift = CYC2NS_SCALE_FACTOR;
-	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+	userpg->time_mult = data->cyc2ns_mul;
+	userpg->time_shift = data->cyc2ns_shift;
+	userpg->time_offset = data->cyc2ns_offset - now;
 
 	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = this_cpu_read(cyc2ns_offset);
+	userpg->time_zero = data->cyc2ns_offset;
+
+	cyc2ns_read_end(data);
 }
 
 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..f5252c4eec8c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
 		 * The WBINVD is insufficient due to the spurious-wakeup
 		 * case where we return around the loop.
 		 */
+		mb();
 		clflush(mwait_ptr);
+		mb();
 		__monitor(mwait_ptr, 0, 0);
 		mb();
 		__mwait(eax, 0);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..6377fb28b958 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
 #include <linux/timex.h>
+#include <linux/static_key.h>
 
 #include <asm/hpet.h>
 #include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
    erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
 
+static struct static_key __use_tsc = STATIC_KEY_INIT;
+
 int tsc_clocksource_reliable;
+
+/*
+ * Use a ring-buffer like data structure, where a writer advances the head by
+ * writing a new data entry and a reader advances the tail when it observes a
+ * new entry.
+ *
+ * Writers are made to wait on readers until there's space to write a new
+ * entry.
+ *
+ * This means that we can always use an {offset, mul} pair to compute a ns
+ * value that is 'roughly' in the right direction, even if we're writing a new
+ * {offset, mul} pair during the clock read.
+ *
+ * The down-side is that we can no longer guarantee strict monotonicity anymore
+ * (assuming the TSC was that to begin with), because while we compute the
+ * intersection point of the two clock slopes and make sure the time is
+ * continuous at the point of switching; we can no longer guarantee a reader is
+ * strictly before or after the switch point.
+ *
+ * It does mean a reader no longer needs to disable IRQs in order to avoid
+ * CPU-Freq updates messing with his times, and similarly an NMI reader will
+ * no longer run the risk of hitting half-written state.
+ */
+
+struct cyc2ns {
+	struct cyc2ns_data data[2];	/*  0 + 2*24 = 48 */
+	struct cyc2ns_data *head;	/* 48 + 8    = 56 */
+	struct cyc2ns_data *tail;	/* 56 + 8    = 64 */
+}; /* exactly fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+struct cyc2ns_data *cyc2ns_read_begin(void)
+{
+	struct cyc2ns_data *head;
+
+	preempt_disable();
+
+	head = this_cpu_read(cyc2ns.head);
+	/*
+	 * Ensure we observe the entry when we observe the pointer to it.
+	 * matches the wmb from cyc2ns_write_end().
+	 */
+	smp_read_barrier_depends();
+	head->__count++;
+	barrier();
+
+	return head;
+}
+
+void cyc2ns_read_end(struct cyc2ns_data *head)
+{
+	barrier();
+	/*
+	 * If we're the outer most nested read; update the tail pointer
+	 * when we're done. This notifies possible pending writers
+	 * that we've observed the head pointer and that the other
+	 * entry is now free.
+	 */
+	if (!--head->__count) {
+		/*
+		 * x86-TSO does not reorder writes with older reads;
+		 * therefore once this write becomes visible to another
+		 * cpu, we must be finished reading the cyc2ns_data.
+		 *
+		 * matches with cyc2ns_write_begin().
+		 */
+		this_cpu_write(cyc2ns.tail, head);
+	}
+	preempt_enable();
+}
+
+/*
+ * Begin writing a new @data entry for @cpu.
+ *
+ * Assumes some sort of write side lock; currently 'provided' by the assumption
+ * that cpufreq will call its notifiers sequentially.
+ */
+static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+{
+	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+	struct cyc2ns_data *data = c2n->data;
+
+	if (data == c2n->head)
+		data++;
+
+	/* XXX send an IPI to @cpu in order to guarantee a read? */
+
+	/*
+	 * When we observe the tail write from cyc2ns_read_end(),
+	 * the cpu must be done with that entry and its safe
+	 * to start writing to it.
+	 */
+	while (c2n->tail == data)
+		cpu_relax();
+
+	return data;
+}
+
+static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+{
+	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+	/*
+	 * Ensure the @data writes are visible before we publish the
+	 * entry. Matches the data-depencency in cyc2ns_read_begin().
+	 */
+	smp_wmb();
+
+	ACCESS_ONCE(c2n->head) = data;
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ *		ns = cycles / (freq / ns_per_sec)
+ *		ns = cycles * (ns_per_sec / freq)
+ *		ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *		ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ *		ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *		ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static void cyc2ns_data_init(struct cyc2ns_data *data)
+{
+	data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_offset = 0;
+	data->__count = 0;
+}
+
+static void cyc2ns_init(int cpu)
+{
+	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+	cyc2ns_data_init(&c2n->data[0]);
+	cyc2ns_data_init(&c2n->data[1]);
+
+	c2n->head = c2n->data;
+	c2n->tail = c2n->data;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	struct cyc2ns_data *data, *tail;
+	unsigned long long ns;
+
+	/*
+	 * See cyc2ns_read_*() for details; replicated in order to avoid
+	 * an extra few instructions that came with the abstraction.
+	 * Notable, it allows us to only do the __count and tail update
+	 * dance when its actually needed.
+	 */
+
+	preempt_disable();
+	data = this_cpu_read(cyc2ns.head);
+	tail = this_cpu_read(cyc2ns.tail);
+
+	if (likely(data == tail)) {
+		ns = data->cyc2ns_offset;
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+	} else {
+		data->__count++;
+
+		barrier();
+
+		ns = data->cyc2ns_offset;
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+		barrier();
+
+		if (!--data->__count)
+			this_cpu_write(cyc2ns.tail, data);
+	}
+	preempt_enable();
+
+	return ns;
+}
+
+/* XXX surely we already have this someplace in the kernel?! */
+#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
+
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+	unsigned long long tsc_now, ns_now;
+	struct cyc2ns_data *data;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sched_clock_idle_sleep_event();
+
+	if (!cpu_khz)
+		goto done;
+
+	data = cyc2ns_write_begin(cpu);
+
+	rdtscll(tsc_now);
+	ns_now = cycles_2_ns(tsc_now);
+
+	/*
+	 * Compute a new multiplier as per the above comment and ensure our
+	 * time function is continuous; see the comment near struct
+	 * cyc2ns_data.
+	 */
+	data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_offset = ns_now -
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+	cyc2ns_write_end(cpu, data);
+
+done:
+	sched_clock_idle_wakeup_event(0);
+	local_irq_restore(flags);
+}
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
 u64 native_sched_clock(void)
 {
-	u64 this_offset;
+	u64 tsc_now;
 
 	/*
 	 * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
 	 * very important for it to be as fast as the platform
 	 * can achieve it. )
 	 */
-	if (unlikely(tsc_disabled)) {
+	if (!static_key_false(&__use_tsc)) {
 		/* No locking but a rare wrong value is not a big deal: */
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 	}
 
 	/* read the Time Stamp Counter: */
-	rdtscll(this_offset);
+	rdtscll(tsc_now);
 
 	/* return the value in ns */
-	return __cycles_2_ns(this_offset);
+	return cycles_2_ns(tsc_now);
 }
 
 /* We need to define a real function for sched_clock, to override the
@@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void)
 EXPORT_SYMBOL(recalibrate_cpu_khz);
 
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
-	unsigned long long tsc_now, ns_now, *offset;
-	unsigned long flags, *scale;
-
-	local_irq_save(flags);
-	sched_clock_idle_sleep_event();
-
-	scale = &per_cpu(cyc2ns, cpu);
-	offset = &per_cpu(cyc2ns_offset, cpu);
-
-	rdtscll(tsc_now);
-	ns_now = __cycles_2_ns(tsc_now);
-
-	if (cpu_khz) {
-		*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
-			cpu_khz / 2) / cpu_khz;
-		*offset = ns_now - mult_frac(tsc_now, *scale,
-			(1UL << CYC2NS_SCALE_FACTOR));
-	}
-
-	sched_clock_idle_wakeup_event(0);
-	local_irq_restore(flags);
-}
-
 static unsigned long long cyc2ns_suspend;
 
 void tsc_save_sched_clock_state(void)
 {
-	if (!sched_clock_stable)
+	if (!sched_clock_stable())
 		return;
 
 	cyc2ns_suspend = sched_clock();
@@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void)
 	unsigned long flags;
 	int cpu;
 
-	if (!sched_clock_stable)
+	if (!sched_clock_stable())
 		return;
 
 	local_irq_save(flags);
 
-	__this_cpu_write(cyc2ns_offset, 0);
+	/*
+	 * We're comming out of suspend, there's no concurrency yet; don't
+	 * bother being nice about the RCU stuff, just write to both
+	 * data fields.
+	 */
+
+	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
 	offset = cyc2ns_suspend - sched_clock();
 
-	for_each_possible_cpu(cpu)
-		per_cpu(cyc2ns_offset, cpu) = offset;
+	for_each_possible_cpu(cpu) {
+		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+	}
 
 	local_irq_restore(flags);
 }
@@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
-		sched_clock_stable = 0;
+		clear_sched_clock_stable();
 		disable_sched_clock_irqtime();
 		pr_info("Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
@@ -995,14 +1187,18 @@ void __init tsc_init(void)
 	 * speed as the bootup CPU. (cpufreq notifiers will fix this
 	 * up if their speed diverges)
 	 */
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		cyc2ns_init(cpu);
 		set_cyc2ns_scale(cpu_khz, cpu);
+	}
 
 	if (tsc_disabled > 0)
 		return;
 
 	/* now allow native_sched_clock() to use rdtsc */
+
 	tsc_disabled = 0;
+	static_key_slow_inc(&__use_tsc);
 
 	if (!no_sched_irq_time)
 		enable_sched_clock_irqtime();
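[Editor's note: as a sanity check of the new mult/shift computation in set_cyc2ns_scale(), here is a worked example under an assumed 2 GHz clock; the numbers are not taken from the patch itself.]

```c
/*
 * Worked example for set_cyc2ns_scale(), assuming cpu_khz = 2,000,000 (2 GHz):
 *
 *   cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << 10, cpu_khz)
 *              = (1,000,000 * 1024 + 1,000,000) / 2,000,000
 *              = 512
 *
 * so ns = (cyc * 512) >> 10 = cyc / 2, i.e. 0.5 ns per cycle, as expected.
 *
 * The offset keeps f(t) continuous across a frequency change:
 *
 *   cyc2ns_offset = ns_now - ((tsc_now * cyc2ns_mul) >> 10)
 *
 * so a reader evaluating offset + (tsc_now * mul) >> shift right at the
 * switch point gets exactly ns_now back again, which is the f(t) == f'(t)
 * condition described in the timer.h comment above.
 */
```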
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index efe4d7220397..dfe605ac1bcd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 	return;
 }
 
-static inline unsigned long cycles_2_us(unsigned long long cyc)
+/*
+ * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
+ * number, not an absolute. It converts a duration in cycles to a duration in
+ * ns.
+ */
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
+	struct cyc2ns_data *data = cyc2ns_read_begin();
 	unsigned long long ns;
-	unsigned long us;
-	int cpu = smp_processor_id();
 
-	ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
-	us = ns / 1000;
-	return us;
+	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+
+	cyc2ns_read_end(data);
+	return ns;
+}
+
+/*
+ * The reverse of the above; converts a duration in ns to a duration in cycles.
+ */
+static inline unsigned long long ns_2_cycles(unsigned long long ns)
+{
+	struct cyc2ns_data *data = cyc2ns_read_begin();
+	unsigned long long cyc;
+
+	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+
+	cyc2ns_read_end(data);
+	return cyc;
+}
+
+static inline unsigned long cycles_2_us(unsigned long long cyc)
+{
+	return cycles_2_ns(cyc) / NSEC_PER_USEC;
+}
+
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+	return ns_2_cycles(sec * NSEC_PER_SEC);
+}
+
+static inline unsigned long long usec_2_cycles(unsigned long usec)
+{
+	return ns_2_cycles(usec * NSEC_PER_USEC);
 }
 
 /*
@@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
 						bcp, try);
 }
 
-static inline cycles_t sec_2_cycles(unsigned long sec)
-{
-	unsigned long ns;
-	cycles_t cyc;
-
-	ns = sec * 1000000000;
-	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-	return cyc;
-}
-
 /*
  * Our retries are blocked by all destination sw ack resources being
  * in use, and a timeout is pending. In that case hardware immediately
@@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
-static inline unsigned long long usec_2_cycles(unsigned long microsec)
-{
-	unsigned long ns;
-	unsigned long long cyc;
-
-	ns = microsec * 1000;
-	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-	return cyc;
-}
-
 /*
  * Display the statistics thru /proc/sgi_uv/ptc_statistics
  * 'data' points to the cpu number
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb8380a1c..96bc506ac6de 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
 348	i386	process_vm_writev	sys_process_vm_writev	compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
+351	i386	sched_setattr		sys_sched_setattr
+352	i386	sched_getattr		sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..a12bddc7ccea 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@
 311	64	process_vm_writev	sys_process_vm_writev
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
+314	common	sched_setattr		sys_sched_setattr
+315	common	sched_getattr		sys_sched_getattr
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index fc6008fbce35..509452a62f96 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -193,10 +193,7 @@ static int power_saving_thread(void *data)
 			CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 		stop_critical_timings();
 
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(power_saving_mwait_eax, 1);
+		mwait_idle_with_hints(power_saving_mwait_eax, 1);
 
 		start_critical_timings();
 		if (lapic_marked_unstable)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 644516d9bde6..f90c56c8379e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -727,11 +727,6 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 	if (unlikely(!pr))
 		return -EINVAL;
 
-	if (cx->entry_method == ACPI_CSTATE_FFH) {
-		if (current_set_polling_and_test())
-			return -EINVAL;
-	}
-
 	lapic_timer_state_broadcast(pr, cx, 1);
 	acpi_idle_do_entry(cx);
 
@@ -785,11 +780,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 	if (unlikely(!pr))
 		return -EINVAL;
 
-	if (cx->entry_method == ACPI_CSTATE_FFH) {
-		if (current_set_polling_and_test())
-			return -EINVAL;
-	}
-
 	/*
 	 * Must be done before busmaster disable as we might need to
 	 * access HPET !
@@ -841,11 +831,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 		}
 	}
 
-	if (cx->entry_method == ACPI_CSTATE_FFH) {
-		if (current_set_polling_and_test())
-			return -EINVAL;
-	}
-
 	acpi_unlazy_tlb(smp_processor_id());
 
 	/* Tell the scheduler that we are going deep-idle: */
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 797ed29a36ea..6c0e0452dd9b 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -377,16 +377,7 @@ static int intel_idle(struct cpuidle_device *dev,
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 
-	if (!current_set_polling_and_test()) {
-
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(eax, ecx);
-	}
+	mwait_idle_with_hints(eax, ecx);
 
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 8f181b3f842b..d833c8f5b465 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -438,14 +438,12 @@ static int clamp_thread(void *arg)
 			 */
 			local_touch_nmi();
 			stop_critical_timings();
-			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			cpu_relax(); /* allow HT sibling to run */
-			__mwait(eax, ecx);
+			mwait_idle_with_hints(eax, ecx);
 			start_critical_timings();
 			atomic_inc(&idle_wakeup_counter);
 		}
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
+		preempt_enable();
 	}
 	del_timer_sync(&wakeup_timer);
 	clear_bit(cpunr, cpu_clamping_mask);
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 27b1bcffe408..86c12c93e3cf 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -1,9 +1,35 @@
 #ifndef _LINUX_BH_H
 #define _LINUX_BH_H
 
-extern void local_bh_disable(void);
+#include <linux/preempt.h>
+#include <linux/preempt_mask.h>
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
+#else
+static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+	preempt_count_add(cnt);
+	barrier();
+}
+#endif
+
+static inline void local_bh_disable(void)
+{
+	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
+}
+
 extern void _local_bh_enable(void);
-extern void local_bh_enable(void);
-extern void local_bh_enable_ip(unsigned long ip);
+extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);
+
+static inline void local_bh_enable_ip(unsigned long ip)
+{
+	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
+}
+
+static inline void local_bh_enable(void)
+{
+	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
+}
 
 #endif /* _LINUX_BH_H */
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d9cf963ac832..12d5f972f23f 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -5,6 +5,7 @@
 #include <linux/lockdep.h>
 #include <linux/ftrace_irq.h>
 #include <linux/vtime.h>
+#include <asm/hardirq.h>
 
 
 extern void synchronize_irq(unsigned int irq);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b0ed422e4e4a..f0e52383a001 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -11,6 +11,7 @@
 #include <linux/user_namespace.h>
 #include <linux/securebits.h>
 #include <linux/seqlock.h>
+#include <linux/rbtree.h>
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
 
@@ -154,6 +155,14 @@ extern struct task_group root_task_group;
 
 #define INIT_TASK_COMM "swapper"
 
+#ifdef CONFIG_RT_MUTEXES
+# define INIT_RT_MUTEXES(tsk) \
+	.pi_waiters = RB_ROOT, \
+	.pi_waiters_leftmost = NULL,
+#else
+# define INIT_RT_MUTEXES(tsk)
+#endif
+
 /*
  * INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -221,6 +230,7 @@ extern struct task_group root_task_group;
 	INIT_TRACE_RECURSION \
 	INIT_TASK_RCU_PREEMPT(tsk) \
 	INIT_CPUSET_SEQ(tsk) \
+	INIT_RT_MUTEXES(tsk) \
 	INIT_VTIME(tsk) \
 }
 
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index a3d9dc8c2c00..59749fc48328 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -64,7 +64,11 @@ do { \
 } while (0)
 
 #else
-#define preempt_enable() preempt_enable_no_resched()
+#define preempt_enable() \
+do { \
+	barrier(); \
+	preempt_count_dec(); \
+} while (0)
 #define preempt_check_resched() do { } while (0)
 #endif
 
@@ -93,7 +97,11 @@ do { \
 		__preempt_schedule_context(); \
 } while (0)
 #else
-#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
+#define preempt_enable_notrace() \
+do { \
+	barrier(); \
+	__preempt_count_dec(); \
+} while (0)
 #endif
 
 #else /* !CONFIG_PREEMPT_COUNT */
@@ -116,6 +124,31 @@ do { \
 
 #endif /* CONFIG_PREEMPT_COUNT */
 
+#ifdef MODULE
+/*
+ * Modules have no business playing preemption tricks.
+ */
+#undef sched_preempt_enable_no_resched
+#undef preempt_enable_no_resched
+#undef preempt_enable_no_resched_notrace
+#undef preempt_check_resched
+#endif
+
+#ifdef CONFIG_PREEMPT
+#define preempt_set_need_resched() \
+do { \
+	set_preempt_need_resched(); \
+} while (0)
+#define preempt_fold_need_resched() \
+do { \
+	if (tif_need_resched()) \
+		set_preempt_need_resched(); \
+} while (0)
+#else
+#define preempt_set_need_resched() do { } while (0)
+#define preempt_fold_need_resched() do { } while (0)
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
 struct preempt_notifier;
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
index d169820203dd..dbeec4d4a3be 100644
--- a/include/linux/preempt_mask.h
+++ b/include/linux/preempt_mask.h
@@ -2,7 +2,6 @@
 #define LINUX_PREEMPT_MASK_H
 
 #include <linux/preempt.h>
-#include <asm/hardirq.h>
 
 /*
  * We put the hardirq and softirq counter into the preemption
@@ -79,6 +78,21 @@
 #endif
 
 /*
+ * The preempt_count offset needed for things like:
+ *
+ *  spin_lock_bh()
+ *
+ * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
+ * softirqs, such that unlock sequences of:
+ *
+ *  spin_unlock();
+ *  local_bh_enable();
+ *
+ * Work as expected.
+ */
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
+
+/*
  * Are we running in atomic context? WARNING: this macro cannot
  * always detect atomic context; in particular, it cannot know about
  * held spinlocks in non-preemptible kernels. Thus it should not be
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index de17134244f3..3aed8d737e1a 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -13,7 +13,7 @@
 #define __LINUX_RT_MUTEX_H
 
 #include <linux/linkage.h>
-#include <linux/plist.h>
+#include <linux/rbtree.h>
 #include <linux/spinlock_types.h>
 
 extern int max_lock_depth; /* for sysctl */
@@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */
  * The rt_mutex structure
  *
  * @wait_lock:	spinlock to protect the structure
- * @wait_list:	pilist head to enqueue waiters in priority order
+ * @waiters:	rbtree root to enqueue waiters in priority order
+ * @waiters_leftmost: top waiter
  * @owner:	the mutex owner
  */
 struct rt_mutex {
 	raw_spinlock_t		wait_lock;
-	struct plist_head	wait_list;
+	struct rb_root		waiters;
+	struct rb_node		*waiters_leftmost;
 	struct task_struct	*owner;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 	int			save_state;
@@ -66,7 +68,7 @@ struct hrtimer_sleeper;
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
   { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
-  , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \
+  , .waiters = RB_ROOT \
   , .owner = NULL \
   __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
 
@@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock);
 
 extern void rt_mutex_unlock(struct rt_mutex *lock);
 
-#ifdef CONFIG_RT_MUTEXES
-# define INIT_RT_MUTEXES(tsk) \
-	.pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \
-	INIT_RT_MUTEX_DEBUG(tsk)
-#else
-# define INIT_RT_MUTEXES(tsk)
-#endif
-
 #endif
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index 9c9f0495d37c..5b9b84b20407 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -172,8 +172,7 @@ static inline void __raw_read_lock_irq(rwlock_t *lock)
 
 static inline void __raw_read_lock_bh(rwlock_t *lock)
 {
-	local_bh_disable();
-	preempt_disable();
+	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
 }
@@ -200,8 +199,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock)
 
 static inline void __raw_write_lock_bh(rwlock_t *lock)
 {
-	local_bh_disable();
-	preempt_disable();
+	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
 }
@@ -250,8 +248,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	do_raw_read_unlock(lock);
-	preempt_enable_no_resched();
-	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+	__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 }
 
 static inline void __raw_write_unlock_irqrestore(rwlock_t *lock,
@@ -275,8 +272,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	do_raw_write_unlock(lock);
-	preempt_enable_no_resched();
-	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+	__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 }
 
 #endif /* __LINUX_RWLOCK_API_SMP_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 53f97eb8dbc7..ffccdad050b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,6 +16,7 @@ struct sched_param {
16#include <linux/types.h> 16#include <linux/types.h>
17#include <linux/timex.h> 17#include <linux/timex.h>
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/plist.h>
19#include <linux/rbtree.h> 20#include <linux/rbtree.h>
20#include <linux/thread_info.h> 21#include <linux/thread_info.h>
21#include <linux/cpumask.h> 22#include <linux/cpumask.h>
@@ -56,6 +57,70 @@ struct sched_param {
56 57
57#include <asm/processor.h> 58#include <asm/processor.h>
58 59
60#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
61
62/*
63 * Extended scheduling parameters data structure.
64 *
65 * This is needed because the original struct sched_param can not be
66 * altered without introducing ABI issues with legacy applications
67 * (e.g., in sched_getparam()).
68 *
69 * However, the possibility of specifying more than just a priority for
70 * the tasks may be useful for a wide variety of application fields, e.g.,
71 * multimedia, streaming, automation and control, and many others.
72 *
73 * This variant (sched_attr) is meant at describing a so-called
74 * sporadic time-constrained task. In such model a task is specified by:
75 * - the activation period or minimum instance inter-arrival time;
76 * - the maximum (or average, depending on the actual scheduling
77 * discipline) computation time of all instances, a.k.a. runtime;
78 * - the deadline (relative to the actual activation time) of each
79 * instance.
80 * Very briefly, a periodic (sporadic) task asks for the execution of
81 * some specific computation --which is typically called an instance--
82 * (at most) every period. Moreover, each instance typically lasts no more
83 * than the runtime and must be completed by time instant t equal to
84 * the instance activation time + the deadline.
85 *
86 * This is reflected by the actual fields of the sched_attr structure:
87 *
88 * @size size of the structure, for fwd/bwd compat.
89 *
90 * @sched_policy task's scheduling policy
91 * @sched_flags for customizing the scheduler behaviour
92 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
93 * @sched_priority task's static priority (SCHED_FIFO/RR)
94 * @sched_deadline representative of the task's deadline
95 * @sched_runtime representative of the task's runtime
96 * @sched_period representative of the task's period
97 *
98 * Given this task model, there are a multiplicity of scheduling algorithms
99 * and policies, that can be used to ensure all the tasks will make their
100 * timing constraints.
101 *
102 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
103 * only user of this new interface. More information about the algorithm
104 * available in the scheduling class file or in Documentation/.
105 */
106struct sched_attr {
107 u32 size;
108
109 u32 sched_policy;
110 u64 sched_flags;
111
112 /* SCHED_NORMAL, SCHED_BATCH */
113 s32 sched_nice;
114
115 /* SCHED_FIFO, SCHED_RR */
116 u32 sched_priority;
117
118 /* SCHED_DEADLINE */
119 u64 sched_runtime;
120 u64 sched_deadline;
121 u64 sched_period;
122};
123
59struct exec_domain; 124struct exec_domain;
60struct futex_pi_state; 125struct futex_pi_state;
61struct robust_list_head; 126struct robust_list_head;
@@ -168,7 +233,6 @@ extern char ___assert_task_state[1 - 2*!!(
168 233
169#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) 234#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
170#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) 235#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
171#define task_is_dead(task) ((task)->exit_state != 0)
172#define task_is_stopped_or_traced(task) \ 236#define task_is_stopped_or_traced(task) \
173 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 237 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
174#define task_contributes_to_load(task) \ 238#define task_contributes_to_load(task) \
@@ -1029,6 +1093,51 @@ struct sched_rt_entity {
1029#endif 1093#endif
1030}; 1094};
1031 1095
1096struct sched_dl_entity {
1097 struct rb_node rb_node;
1098
1099 /*
1100 * Original scheduling parameters. Copied here from sched_attr
1101 * during sched_setscheduler2(), they will remain the same until
1102 * the next sched_setscheduler2().
1103 */
1104 u64 dl_runtime; /* maximum runtime for each instance */
1105 u64 dl_deadline; /* relative deadline of each instance */
1106 u64 dl_period; /* separation of two instances (period) */
1107 u64 dl_bw; /* dl_runtime / dl_deadline */
1108
1109 /*
1110 * Actual scheduling parameters. Initialized with the values above,
 1111 * they are continuously updated during task execution. Note that
1112 * the remaining runtime could be < 0 in case we are in overrun.
1113 */
1114 s64 runtime; /* remaining runtime for this instance */
1115 u64 deadline; /* absolute deadline for this instance */
1116 unsigned int flags; /* specifying the scheduler behaviour */
1117
1118 /*
1119 * Some bool flags:
1120 *
1121 * @dl_throttled tells if we exhausted the runtime. If so, the
1122 * task has to wait for a replenishment to be performed at the
1123 * next firing of dl_timer.
1124 *
1125 * @dl_new tells if a new instance arrived. If so we must
1126 * start executing it with full runtime and reset its absolute
1127 * deadline;
1128 *
 1129 * @dl_boosted tells if we are boosted due to deadline inheritance (DI).
 1130 * If so we are outside the bandwidth enforcement mechanism (but
 1131 * only until we exit the critical section).
1132 */
1133 int dl_throttled, dl_new, dl_boosted;
1134
1135 /*
1136 * Bandwidth enforcement timer. Each -deadline task has its
1137 * own bandwidth to be enforced, thus we need one timer per task.
1138 */
1139 struct hrtimer dl_timer;
1140};
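
As a rough illustration of how the actual scheduling fields above interact (a simplified sketch only, not the kernel/sched/deadline.c implementation; dl_account_sketch() and dl_replenish_sketch() are hypothetical helpers):

static void dl_account_sketch(struct sched_dl_entity *dl_se, u64 delta_exec)
{
	/* Charge the CPU time just consumed; runtime may go negative on overrun. */
	dl_se->runtime -= delta_exec;

	/* Budget exhausted: wait for the per-task dl_timer to replenish it. */
	if (dl_se->runtime <= 0)
		dl_se->dl_throttled = 1;
}

static void dl_replenish_sketch(struct sched_dl_entity *dl_se)
{
	/* Conceptually what dl_timer does when it fires. */
	dl_se->runtime  += dl_se->dl_runtime;
	dl_se->deadline += dl_se->dl_period;
	dl_se->dl_throttled = 0;
}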
1032 1141
1033struct rcu_node; 1142struct rcu_node;
1034 1143
@@ -1065,6 +1174,7 @@ struct task_struct {
1065#ifdef CONFIG_CGROUP_SCHED 1174#ifdef CONFIG_CGROUP_SCHED
1066 struct task_group *sched_task_group; 1175 struct task_group *sched_task_group;
1067#endif 1176#endif
1177 struct sched_dl_entity dl;
1068 1178
1069#ifdef CONFIG_PREEMPT_NOTIFIERS 1179#ifdef CONFIG_PREEMPT_NOTIFIERS
1070 /* list of struct preempt_notifier: */ 1180 /* list of struct preempt_notifier: */
@@ -1098,6 +1208,7 @@ struct task_struct {
1098 struct list_head tasks; 1208 struct list_head tasks;
1099#ifdef CONFIG_SMP 1209#ifdef CONFIG_SMP
1100 struct plist_node pushable_tasks; 1210 struct plist_node pushable_tasks;
1211 struct rb_node pushable_dl_tasks;
1101#endif 1212#endif
1102 1213
1103 struct mm_struct *mm, *active_mm; 1214 struct mm_struct *mm, *active_mm;
@@ -1249,9 +1360,12 @@ struct task_struct {
1249 1360
1250#ifdef CONFIG_RT_MUTEXES 1361#ifdef CONFIG_RT_MUTEXES
1251 /* PI waiters blocked on a rt_mutex held by this task */ 1362 /* PI waiters blocked on a rt_mutex held by this task */
1252 struct plist_head pi_waiters; 1363 struct rb_root pi_waiters;
1364 struct rb_node *pi_waiters_leftmost;
1253 /* Deadlock detection and priority inheritance handling */ 1365 /* Deadlock detection and priority inheritance handling */
1254 struct rt_mutex_waiter *pi_blocked_on; 1366 struct rt_mutex_waiter *pi_blocked_on;
1367 /* Top pi_waiters task */
1368 struct task_struct *pi_top_task;
1255#endif 1369#endif
1256 1370
1257#ifdef CONFIG_DEBUG_MUTEXES 1371#ifdef CONFIG_DEBUG_MUTEXES
@@ -1880,7 +1994,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
1880 * but then during bootup it turns out that sched_clock() 1994 * but then during bootup it turns out that sched_clock()
1881 * is reliable after all: 1995 * is reliable after all:
1882 */ 1996 */
1883extern int sched_clock_stable; 1997extern int sched_clock_stable(void);
1998extern void set_sched_clock_stable(void);
1999extern void clear_sched_clock_stable(void);
1884 2000
1885extern void sched_clock_tick(void); 2001extern void sched_clock_tick(void);
1886extern void sched_clock_idle_sleep_event(void); 2002extern void sched_clock_idle_sleep_event(void);
@@ -1959,6 +2075,8 @@ extern int sched_setscheduler(struct task_struct *, int,
1959 const struct sched_param *); 2075 const struct sched_param *);
1960extern int sched_setscheduler_nocheck(struct task_struct *, int, 2076extern int sched_setscheduler_nocheck(struct task_struct *, int,
1961 const struct sched_param *); 2077 const struct sched_param *);
2078extern int sched_setattr(struct task_struct *,
2079 const struct sched_attr *);
1962extern struct task_struct *idle_task(int cpu); 2080extern struct task_struct *idle_task(int cpu);
1963/** 2081/**
1964 * is_idle_task - is the specified task an idle task? 2082 * is_idle_task - is the specified task an idle task?
@@ -2038,7 +2156,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
2038#else 2156#else
2039 static inline void kick_process(struct task_struct *tsk) { } 2157 static inline void kick_process(struct task_struct *tsk) { }
2040#endif 2158#endif
2041extern void sched_fork(unsigned long clone_flags, struct task_struct *p); 2159extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
2042extern void sched_dead(struct task_struct *p); 2160extern void sched_dead(struct task_struct *p);
2043 2161
2044extern void proc_caches_init(void); 2162extern void proc_caches_init(void);
@@ -2627,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void)
2627} 2745}
2628#endif 2746#endif
2629 2747
2748static inline void current_clr_polling(void)
2749{
2750 __current_clr_polling();
2751
2752 /*
2753 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
2754 * Once the bit is cleared, we'll get IPIs with every new
2755 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
2756 * fold.
2757 */
2758 smp_mb(); /* paired with resched_task() */
2759
2760 preempt_fold_need_resched();
2761}
2762
2630static __always_inline bool need_resched(void) 2763static __always_inline bool need_resched(void)
2631{ 2764{
2632 return unlikely(tif_need_resched()); 2765 return unlikely(tif_need_resched());
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
new file mode 100644
index 000000000000..9d303b8847df
--- /dev/null
+++ b/include/linux/sched/deadline.h
@@ -0,0 +1,24 @@
1#ifndef _SCHED_DEADLINE_H
2#define _SCHED_DEADLINE_H
3
4/*
 5 * SCHED_DEADLINE tasks have negative priorities, reflecting
 6 * the fact that any of them has a higher prio than RT and
 7 * NORMAL/BATCH tasks.
8 */
9
10#define MAX_DL_PRIO 0
11
12static inline int dl_prio(int prio)
13{
14 if (unlikely(prio < MAX_DL_PRIO))
15 return 1;
16 return 0;
17}
18
19static inline int dl_task(struct task_struct *p)
20{
21 return dl_prio(p->prio);
22}
23
24#endif /* _SCHED_DEADLINE_H */
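
For reference, the priority ranges these helpers operate on, using the values the core scheduler assigns elsewhere in this series (shown purely as a worked illustration):

/*
 * deadline tasks : prio = MAX_DL_PRIO - 1 = -1            -> dl_prio() true
 * RT tasks       : prio in [0 .. MAX_RT_PRIO - 1]  = [0 .. 99]
 * normal tasks   : prio in [MAX_RT_PRIO .. MAX_PRIO - 1] = [100 .. 139]
 *
 * so dl_prio(prio) reduces to "prio < 0", and any -deadline (or
 * deadline-boosted) task sorts ahead of every RT and fair task.
 */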
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 440434df3627..34e4ebea8fce 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -35,6 +35,7 @@ static inline int rt_task(struct task_struct *p)
35#ifdef CONFIG_RT_MUTEXES 35#ifdef CONFIG_RT_MUTEXES
36extern int rt_mutex_getprio(struct task_struct *p); 36extern int rt_mutex_getprio(struct task_struct *p);
37extern void rt_mutex_setprio(struct task_struct *p, int prio); 37extern void rt_mutex_setprio(struct task_struct *p, int prio);
38extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
38extern void rt_mutex_adjust_pi(struct task_struct *p); 39extern void rt_mutex_adjust_pi(struct task_struct *p);
39static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 40static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
40{ 41{
@@ -45,6 +46,10 @@ static inline int rt_mutex_getprio(struct task_struct *p)
45{ 46{
46 return p->normal_prio; 47 return p->normal_prio;
47} 48}
49static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
50{
51 return NULL;
52}
48# define rt_mutex_adjust_pi(p) do { } while (0) 53# define rt_mutex_adjust_pi(p) do { } while (0)
49static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 54static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
50{ 55{
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 41467f8ff8ec..31e0193cb0c5 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,7 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
48extern unsigned int sysctl_numa_balancing_scan_period_min; 48extern unsigned int sysctl_numa_balancing_scan_period_min;
49extern unsigned int sysctl_numa_balancing_scan_period_max; 49extern unsigned int sysctl_numa_balancing_scan_period_max;
50extern unsigned int sysctl_numa_balancing_scan_size; 50extern unsigned int sysctl_numa_balancing_scan_size;
51extern unsigned int sysctl_numa_balancing_settle_count;
52 51
53#ifdef CONFIG_SCHED_DEBUG 52#ifdef CONFIG_SCHED_DEBUG
54extern unsigned int sysctl_sched_migration_cost; 53extern unsigned int sysctl_sched_migration_cost;
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index bdb9993f0fda..42dfab89e740 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -131,8 +131,7 @@ static inline void __raw_spin_lock_irq(raw_spinlock_t *lock)
131 131
132static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) 132static inline void __raw_spin_lock_bh(raw_spinlock_t *lock)
133{ 133{
134 local_bh_disable(); 134 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
135 preempt_disable();
136 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 135 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
137 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); 136 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
138} 137}
@@ -174,20 +173,17 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock)
174{ 173{
175 spin_release(&lock->dep_map, 1, _RET_IP_); 174 spin_release(&lock->dep_map, 1, _RET_IP_);
176 do_raw_spin_unlock(lock); 175 do_raw_spin_unlock(lock);
177 preempt_enable_no_resched(); 176 __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
178 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
179} 177}
180 178
181static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) 179static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
182{ 180{
183 local_bh_disable(); 181 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
184 preempt_disable();
185 if (do_raw_spin_trylock(lock)) { 182 if (do_raw_spin_trylock(lock)) {
186 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); 183 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
187 return 1; 184 return 1;
188 } 185 }
189 preempt_enable_no_resched(); 186 __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
190 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
191 return 0; 187 return 0;
192} 188}
193 189
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index af1f47229e70..d0d188861ad6 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -24,11 +24,14 @@
24 * flags straight, to suppress compiler warnings of unused lock 24 * flags straight, to suppress compiler warnings of unused lock
25 * variables, and to add the proper checker annotations: 25 * variables, and to add the proper checker annotations:
26 */ 26 */
27#define ___LOCK(lock) \
28 do { __acquire(lock); (void)(lock); } while (0)
29
27#define __LOCK(lock) \ 30#define __LOCK(lock) \
28 do { preempt_disable(); __acquire(lock); (void)(lock); } while (0) 31 do { preempt_disable(); ___LOCK(lock); } while (0)
29 32
30#define __LOCK_BH(lock) \ 33#define __LOCK_BH(lock) \
31 do { local_bh_disable(); __LOCK(lock); } while (0) 34 do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0)
32 35
33#define __LOCK_IRQ(lock) \ 36#define __LOCK_IRQ(lock) \
34 do { local_irq_disable(); __LOCK(lock); } while (0) 37 do { local_irq_disable(); __LOCK(lock); } while (0)
@@ -36,12 +39,15 @@
36#define __LOCK_IRQSAVE(lock, flags) \ 39#define __LOCK_IRQSAVE(lock, flags) \
37 do { local_irq_save(flags); __LOCK(lock); } while (0) 40 do { local_irq_save(flags); __LOCK(lock); } while (0)
38 41
42#define ___UNLOCK(lock) \
43 do { __release(lock); (void)(lock); } while (0)
44
39#define __UNLOCK(lock) \ 45#define __UNLOCK(lock) \
40 do { preempt_enable(); __release(lock); (void)(lock); } while (0) 46 do { preempt_enable(); ___UNLOCK(lock); } while (0)
41 47
42#define __UNLOCK_BH(lock) \ 48#define __UNLOCK_BH(lock) \
43 do { preempt_enable_no_resched(); local_bh_enable(); \ 49 do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \
44 __release(lock); (void)(lock); } while (0) 50 ___UNLOCK(lock); } while (0)
45 51
46#define __UNLOCK_IRQ(lock) \ 52#define __UNLOCK_IRQ(lock) \
47 do { local_irq_enable(); __UNLOCK(lock); } while (0) 53 do { local_irq_enable(); __UNLOCK(lock); } while (0)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 94273bbe6050..40ed9e9a77e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -38,6 +38,7 @@ struct rlimit;
38struct rlimit64; 38struct rlimit64;
39struct rusage; 39struct rusage;
40struct sched_param; 40struct sched_param;
41struct sched_attr;
41struct sel_arg_struct; 42struct sel_arg_struct;
42struct semaphore; 43struct semaphore;
43struct sembuf; 44struct sembuf;
@@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
279 struct sched_param __user *param); 280 struct sched_param __user *param);
280asmlinkage long sys_sched_setparam(pid_t pid, 281asmlinkage long sys_sched_setparam(pid_t pid,
281 struct sched_param __user *param); 282 struct sched_param __user *param);
283asmlinkage long sys_sched_setattr(pid_t pid,
284 struct sched_attr __user *attr);
282asmlinkage long sys_sched_getscheduler(pid_t pid); 285asmlinkage long sys_sched_getscheduler(pid_t pid);
283asmlinkage long sys_sched_getparam(pid_t pid, 286asmlinkage long sys_sched_getparam(pid_t pid,
284 struct sched_param __user *param); 287 struct sched_param __user *param);
288asmlinkage long sys_sched_getattr(pid_t pid,
289 struct sched_attr __user *attr,
290 unsigned int size);
285asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 291asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
286 unsigned long __user *user_mask_ptr); 292 unsigned long __user *user_mask_ptr);
287asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 293asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 9d8cf056e661..ecd3319dac33 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -25,13 +25,16 @@ static inline void pagefault_disable(void)
25 25
26static inline void pagefault_enable(void) 26static inline void pagefault_enable(void)
27{ 27{
28#ifndef CONFIG_PREEMPT
28 /* 29 /*
29 * make sure to issue those last loads/stores before enabling 30 * make sure to issue those last loads/stores before enabling
30 * the pagefault handler again. 31 * the pagefault handler again.
31 */ 32 */
32 barrier(); 33 barrier();
33 preempt_count_dec(); 34 preempt_count_dec();
34 preempt_check_resched(); 35#else
36 preempt_enable();
37#endif
35} 38}
36 39
37#ifndef ARCH_HAS_NOCACHE_UACCESS 40#ifndef ARCH_HAS_NOCACHE_UACCESS
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 829627d7b846..1d67fb6b23a0 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -42,27 +42,10 @@ static inline bool net_busy_loop_on(void)
42 return sysctl_net_busy_poll; 42 return sysctl_net_busy_poll;
43} 43}
44 44
45/* a wrapper to make debug_smp_processor_id() happy
46 * we can use sched_clock() because we don't care much about precision
47 * we only care that the average is bounded
48 */
49#ifdef CONFIG_DEBUG_PREEMPT
50static inline u64 busy_loop_us_clock(void)
51{
52 u64 rc;
53
54 preempt_disable_notrace();
55 rc = sched_clock();
56 preempt_enable_no_resched_notrace();
57
58 return rc >> 10;
59}
60#else /* CONFIG_DEBUG_PREEMPT */
61static inline u64 busy_loop_us_clock(void) 45static inline u64 busy_loop_us_clock(void)
62{ 46{
63 return sched_clock() >> 10; 47 return local_clock() >> 10;
64} 48}
65#endif /* CONFIG_DEBUG_PREEMPT */
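
The ">> 10" keeps this path division-free: it approximates a nanoseconds-to-microseconds conversion by dividing by 1024 instead of 1000, which is accurate enough here because busy polling only needs a bounded average. A quick worked example (illustration only):

/*
 *   local_clock() delta for ~50 us of polling = 50000 ns
 *   50000 >> 10 = 48      (the exact value would be 50000 / 1000 = 50)
 *
 * The ~2.4% error is irrelevant for deciding when to stop the
 * busy-poll loop.
 */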
66 49
67static inline unsigned long sk_busy_loop_end_time(struct sock *sk) 50static inline unsigned long sk_busy_loop_end_time(struct sock *sk)
68{ 51{
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5a0f945927ac..34f9d7387d13 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -39,8 +39,14 @@
39#define SCHED_BATCH 3 39#define SCHED_BATCH 3
40/* SCHED_ISO: reserved but not implemented yet */ 40/* SCHED_ISO: reserved but not implemented yet */
41#define SCHED_IDLE 5 41#define SCHED_IDLE 5
42#define SCHED_DEADLINE 6
43
42/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ 44/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
43#define SCHED_RESET_ON_FORK 0x40000000 45#define SCHED_RESET_ON_FORK 0x40000000
44 46
47/*
48 * For the sched_{set,get}attr() calls
49 */
50#define SCHED_FLAG_RESET_ON_FORK 0x01
45 51
46#endif /* _UAPI_LINUX_SCHED_H */ 52#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 988573a9a387..277f494c2a9a 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -105,14 +105,17 @@ static void cpu_idle_loop(void)
105 __current_set_polling(); 105 __current_set_polling();
106 } 106 }
107 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
110 * bit here because we might not have send the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
115 } 108 }
109
110 /*
111 * Since we fell out of the loop above, we know
112 * TIF_NEED_RESCHED must be set, propagate it into
113 * PREEMPT_NEED_RESCHED.
114 *
115 * This is required because for polling idle loops we will
116 * not have had an IPI to fold the state for us.
117 */
118 preempt_set_need_resched();
116 tick_nohz_idle_exit(); 119 tick_nohz_idle_exit();
117 schedule_preempt_disabled(); 120 schedule_preempt_disabled();
118 } 121 }
diff --git a/kernel/fork.c b/kernel/fork.c
index dfa736c98d17..294189fc7ac8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1087,8 +1087,10 @@ static void rt_mutex_init_task(struct task_struct *p)
1087{ 1087{
1088 raw_spin_lock_init(&p->pi_lock); 1088 raw_spin_lock_init(&p->pi_lock);
1089#ifdef CONFIG_RT_MUTEXES 1089#ifdef CONFIG_RT_MUTEXES
1090 plist_head_init(&p->pi_waiters); 1090 p->pi_waiters = RB_ROOT;
1091 p->pi_waiters_leftmost = NULL;
1091 p->pi_blocked_on = NULL; 1092 p->pi_blocked_on = NULL;
1093 p->pi_top_task = NULL;
1092#endif 1094#endif
1093} 1095}
1094 1096
@@ -1311,7 +1313,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1311#endif 1313#endif
1312 1314
1313 /* Perform scheduler related setup. Assign this task to a CPU. */ 1315 /* Perform scheduler related setup. Assign this task to a CPU. */
1314 sched_fork(clone_flags, p); 1316 retval = sched_fork(clone_flags, p);
1317 if (retval)
1318 goto bad_fork_cleanup_policy;
1315 1319
1316 retval = perf_event_init_task(p); 1320 retval = perf_event_init_task(p);
1317 if (retval) 1321 if (retval)
@@ -1403,13 +1407,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1403 p->tgid = p->pid; 1407 p->tgid = p->pid;
1404 } 1408 }
1405 1409
1406 p->pdeath_signal = 0;
1407 p->exit_state = 0;
1408
1409 p->nr_dirtied = 0; 1410 p->nr_dirtied = 0;
1410 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1411 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1411 p->dirty_paused_when = 0; 1412 p->dirty_paused_when = 0;
1412 1413
1414 p->pdeath_signal = 0;
1413 INIT_LIST_HEAD(&p->thread_group); 1415 INIT_LIST_HEAD(&p->thread_group);
1414 p->task_works = NULL; 1416 p->task_works = NULL;
1415 1417
diff --git a/kernel/futex.c b/kernel/futex.c
index 1ddc4498f1e1..44a1261cb9ff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2426,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2426 * code while we sleep on uaddr. 2426 * code while we sleep on uaddr.
2427 */ 2427 */
2428 debug_rt_mutex_init_waiter(&rt_waiter); 2428 debug_rt_mutex_init_waiter(&rt_waiter);
2429 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2430 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2429 rt_waiter.task = NULL; 2431 rt_waiter.task = NULL;
2430 2432
2431 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2433 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 383319bae3f7..09094361dce5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,7 @@
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/sched/deadline.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
50#include <linux/freezer.h> 51#include <linux/freezer.h>
51 52
@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1610 unsigned long slack; 1611 unsigned long slack;
1611 1612
1612 slack = current->timer_slack_ns; 1613 slack = current->timer_slack_ns;
1613 if (rt_task(current)) 1614 if (dl_task(current) || rt_task(current))
1614 slack = 0; 1615 slack = 0;
1615 1616
1616 hrtimer_init_on_stack(&t.timer, clockid, mode); 1617 hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..49b2ed3dced8 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -24,7 +24,7 @@
24#include <linux/kallsyms.h> 24#include <linux/kallsyms.h>
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/plist.h> 27#include <linux/rbtree.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/debug_locks.h> 29#include <linux/debug_locks.h>
30 30
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
57 57
58void rt_mutex_debug_task_free(struct task_struct *task) 58void rt_mutex_debug_task_free(struct task_struct *task)
59{ 59{
60 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 60 DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
62} 62}
63 63
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
155{ 155{
156 memset(waiter, 0x11, sizeof(*waiter)); 156 memset(waiter, 0x11, sizeof(*waiter));
157 plist_node_init(&waiter->list_entry, MAX_PRIO);
158 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
159 waiter->deadlock_task_pid = NULL; 157 waiter->deadlock_task_pid = NULL;
160} 158}
161 159
162void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
163{ 161{
164 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
166 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
167 memset(waiter, 0x22, sizeof(*waiter)); 163 memset(waiter, 0x22, sizeof(*waiter));
168} 164}
169 165
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..2e960a2bab81 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -14,6 +14,7 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h> 16#include <linux/sched/rt.h>
17#include <linux/sched/deadline.h>
17#include <linux/timer.h> 18#include <linux/timer.h>
18 19
19#include "rtmutex_common.h" 20#include "rtmutex_common.h"
@@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
91} 92}
92#endif 93#endif
93 94
95static inline int
96rt_mutex_waiter_less(struct rt_mutex_waiter *left,
97 struct rt_mutex_waiter *right)
98{
99 if (left->prio < right->prio)
100 return 1;
101
102 /*
103 * If both waiters have dl_prio(), we check the deadlines of the
104 * associated tasks.
105 * If left waiter has a dl_prio(), and we didn't return 1 above,
106 * then right waiter has a dl_prio() too.
107 */
108 if (dl_prio(left->prio))
109 return (left->task->dl.deadline < right->task->dl.deadline);
110
111 return 0;
112}
113
114static void
115rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
116{
117 struct rb_node **link = &lock->waiters.rb_node;
118 struct rb_node *parent = NULL;
119 struct rt_mutex_waiter *entry;
120 int leftmost = 1;
121
122 while (*link) {
123 parent = *link;
124 entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
125 if (rt_mutex_waiter_less(waiter, entry)) {
126 link = &parent->rb_left;
127 } else {
128 link = &parent->rb_right;
129 leftmost = 0;
130 }
131 }
132
133 if (leftmost)
134 lock->waiters_leftmost = &waiter->tree_entry;
135
136 rb_link_node(&waiter->tree_entry, parent, link);
137 rb_insert_color(&waiter->tree_entry, &lock->waiters);
138}
139
140static void
141rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
142{
143 if (RB_EMPTY_NODE(&waiter->tree_entry))
144 return;
145
146 if (lock->waiters_leftmost == &waiter->tree_entry)
147 lock->waiters_leftmost = rb_next(&waiter->tree_entry);
148
149 rb_erase(&waiter->tree_entry, &lock->waiters);
150 RB_CLEAR_NODE(&waiter->tree_entry);
151}
152
153static void
154rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
155{
156 struct rb_node **link = &task->pi_waiters.rb_node;
157 struct rb_node *parent = NULL;
158 struct rt_mutex_waiter *entry;
159 int leftmost = 1;
160
161 while (*link) {
162 parent = *link;
163 entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
164 if (rt_mutex_waiter_less(waiter, entry)) {
165 link = &parent->rb_left;
166 } else {
167 link = &parent->rb_right;
168 leftmost = 0;
169 }
170 }
171
172 if (leftmost)
173 task->pi_waiters_leftmost = &waiter->pi_tree_entry;
174
175 rb_link_node(&waiter->pi_tree_entry, parent, link);
176 rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
177}
178
179static void
180rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
181{
182 if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
183 return;
184
185 if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
186 task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
187
188 rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
189 RB_CLEAR_NODE(&waiter->pi_tree_entry);
190}
191
94/* 192/*
95 * Calculate task priority from the waiter list priority 193 * Calculate task priority from the waiter tree priority
96 * 194 *
97 * Return task->normal_prio when the waiter list is empty or when 195 * Return task->normal_prio when the waiter tree is empty or when
98 * the waiter is not allowed to do priority boosting 196 * the waiter is not allowed to do priority boosting
99 */ 197 */
100int rt_mutex_getprio(struct task_struct *task) 198int rt_mutex_getprio(struct task_struct *task)
@@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task)
102 if (likely(!task_has_pi_waiters(task))) 200 if (likely(!task_has_pi_waiters(task)))
103 return task->normal_prio; 201 return task->normal_prio;
104 202
105 return min(task_top_pi_waiter(task)->pi_list_entry.prio, 203 return min(task_top_pi_waiter(task)->prio,
106 task->normal_prio); 204 task->normal_prio);
107} 205}
108 206
207struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
208{
209 if (likely(!task_has_pi_waiters(task)))
210 return NULL;
211
212 return task_top_pi_waiter(task)->task;
213}
214
109/* 215/*
110 * Adjust the priority of a task, after its pi_waiters got modified. 216 * Adjust the priority of a task, after its pi_waiters got modified.
111 * 217 *
@@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
115{ 221{
116 int prio = rt_mutex_getprio(task); 222 int prio = rt_mutex_getprio(task);
117 223
118 if (task->prio != prio) 224 if (task->prio != prio || dl_prio(prio))
119 rt_mutex_setprio(task, prio); 225 rt_mutex_setprio(task, prio);
120} 226}
121 227
@@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
233 * When deadlock detection is off then we check, if further 339 * When deadlock detection is off then we check, if further
234 * priority adjustment is necessary. 340 * priority adjustment is necessary.
235 */ 341 */
236 if (!detect_deadlock && waiter->list_entry.prio == task->prio) 342 if (!detect_deadlock && waiter->prio == task->prio)
237 goto out_unlock_pi; 343 goto out_unlock_pi;
238 344
239 lock = waiter->lock; 345 lock = waiter->lock;
@@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 top_waiter = rt_mutex_top_waiter(lock); 360 top_waiter = rt_mutex_top_waiter(lock);
255 361
256 /* Requeue the waiter */ 362 /* Requeue the waiter */
257 plist_del(&waiter->list_entry, &lock->wait_list); 363 rt_mutex_dequeue(lock, waiter);
258 waiter->list_entry.prio = task->prio; 364 waiter->prio = task->prio;
259 plist_add(&waiter->list_entry, &lock->wait_list); 365 rt_mutex_enqueue(lock, waiter);
260 366
261 /* Release the task */ 367 /* Release the task */
262 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 368 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
280 386
281 if (waiter == rt_mutex_top_waiter(lock)) { 387 if (waiter == rt_mutex_top_waiter(lock)) {
282 /* Boost the owner */ 388 /* Boost the owner */
283 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); 389 rt_mutex_dequeue_pi(task, top_waiter);
284 waiter->pi_list_entry.prio = waiter->list_entry.prio; 390 rt_mutex_enqueue_pi(task, waiter);
285 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
286 __rt_mutex_adjust_prio(task); 391 __rt_mutex_adjust_prio(task);
287 392
288 } else if (top_waiter == waiter) { 393 } else if (top_waiter == waiter) {
289 /* Deboost the owner */ 394 /* Deboost the owner */
290 plist_del(&waiter->pi_list_entry, &task->pi_waiters); 395 rt_mutex_dequeue_pi(task, waiter);
291 waiter = rt_mutex_top_waiter(lock); 396 waiter = rt_mutex_top_waiter(lock);
292 waiter->pi_list_entry.prio = waiter->list_entry.prio; 397 rt_mutex_enqueue_pi(task, waiter);
293 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
294 __rt_mutex_adjust_prio(task); 398 __rt_mutex_adjust_prio(task);
295 } 399 }
296 400
@@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
355 * 3) it is top waiter 459 * 3) it is top waiter
356 */ 460 */
357 if (rt_mutex_has_waiters(lock)) { 461 if (rt_mutex_has_waiters(lock)) {
358 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { 462 if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
359 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 463 if (!waiter || waiter != rt_mutex_top_waiter(lock))
360 return 0; 464 return 0;
361 } 465 }
@@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
369 473
370 /* remove the queued waiter. */ 474 /* remove the queued waiter. */
371 if (waiter) { 475 if (waiter) {
372 plist_del(&waiter->list_entry, &lock->wait_list); 476 rt_mutex_dequeue(lock, waiter);
373 task->pi_blocked_on = NULL; 477 task->pi_blocked_on = NULL;
374 } 478 }
375 479
@@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
379 */ 483 */
380 if (rt_mutex_has_waiters(lock)) { 484 if (rt_mutex_has_waiters(lock)) {
381 top = rt_mutex_top_waiter(lock); 485 top = rt_mutex_top_waiter(lock);
382 top->pi_list_entry.prio = top->list_entry.prio; 486 rt_mutex_enqueue_pi(task, top);
383 plist_add(&top->pi_list_entry, &task->pi_waiters);
384 } 487 }
385 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 488 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
386 } 489 }
@@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
416 __rt_mutex_adjust_prio(task); 519 __rt_mutex_adjust_prio(task);
417 waiter->task = task; 520 waiter->task = task;
418 waiter->lock = lock; 521 waiter->lock = lock;
419 plist_node_init(&waiter->list_entry, task->prio); 522 waiter->prio = task->prio;
420 plist_node_init(&waiter->pi_list_entry, task->prio);
421 523
422 /* Get the top priority waiter on the lock */ 524 /* Get the top priority waiter on the lock */
423 if (rt_mutex_has_waiters(lock)) 525 if (rt_mutex_has_waiters(lock))
424 top_waiter = rt_mutex_top_waiter(lock); 526 top_waiter = rt_mutex_top_waiter(lock);
425 plist_add(&waiter->list_entry, &lock->wait_list); 527 rt_mutex_enqueue(lock, waiter);
426 528
427 task->pi_blocked_on = waiter; 529 task->pi_blocked_on = waiter;
428 530
@@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
433 535
434 if (waiter == rt_mutex_top_waiter(lock)) { 536 if (waiter == rt_mutex_top_waiter(lock)) {
435 raw_spin_lock_irqsave(&owner->pi_lock, flags); 537 raw_spin_lock_irqsave(&owner->pi_lock, flags);
436 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 538 rt_mutex_dequeue_pi(owner, top_waiter);
437 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 539 rt_mutex_enqueue_pi(owner, waiter);
438 540
439 __rt_mutex_adjust_prio(owner); 541 __rt_mutex_adjust_prio(owner);
440 if (owner->pi_blocked_on) 542 if (owner->pi_blocked_on)
@@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
486 * boosted mode and go back to normal after releasing 588 * boosted mode and go back to normal after releasing
487 * lock->wait_lock. 589 * lock->wait_lock.
488 */ 590 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 591 rt_mutex_dequeue_pi(current, waiter);
490 592
491 rt_mutex_set_owner(lock, NULL); 593 rt_mutex_set_owner(lock, NULL);
492 594
@@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock,
510 int chain_walk = 0; 612 int chain_walk = 0;
511 613
512 raw_spin_lock_irqsave(&current->pi_lock, flags); 614 raw_spin_lock_irqsave(&current->pi_lock, flags);
513 plist_del(&waiter->list_entry, &lock->wait_list); 615 rt_mutex_dequeue(lock, waiter);
514 current->pi_blocked_on = NULL; 616 current->pi_blocked_on = NULL;
515 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 617 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
516 618
@@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock,
521 623
522 raw_spin_lock_irqsave(&owner->pi_lock, flags); 624 raw_spin_lock_irqsave(&owner->pi_lock, flags);
523 625
524 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 626 rt_mutex_dequeue_pi(owner, waiter);
525 627
526 if (rt_mutex_has_waiters(lock)) { 628 if (rt_mutex_has_waiters(lock)) {
527 struct rt_mutex_waiter *next; 629 struct rt_mutex_waiter *next;
528 630
529 next = rt_mutex_top_waiter(lock); 631 next = rt_mutex_top_waiter(lock);
530 plist_add(&next->pi_list_entry, &owner->pi_waiters); 632 rt_mutex_enqueue_pi(owner, next);
531 } 633 }
532 __rt_mutex_adjust_prio(owner); 634 __rt_mutex_adjust_prio(owner);
533 635
@@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock,
537 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 639 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
538 } 640 }
539 641
540 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
541
542 if (!chain_walk) 642 if (!chain_walk)
543 return; 643 return;
544 644
@@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
565 raw_spin_lock_irqsave(&task->pi_lock, flags); 665 raw_spin_lock_irqsave(&task->pi_lock, flags);
566 666
567 waiter = task->pi_blocked_on; 667 waiter = task->pi_blocked_on;
568 if (!waiter || waiter->list_entry.prio == task->prio) { 668 if (!waiter || (waiter->prio == task->prio &&
669 !dl_prio(task->prio))) {
569 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 670 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
570 return; 671 return;
571 } 672 }
@@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
638 int ret = 0; 739 int ret = 0;
639 740
640 debug_rt_mutex_init_waiter(&waiter); 741 debug_rt_mutex_init_waiter(&waiter);
742 RB_CLEAR_NODE(&waiter.pi_tree_entry);
743 RB_CLEAR_NODE(&waiter.tree_entry);
641 744
642 raw_spin_lock(&lock->wait_lock); 745 raw_spin_lock(&lock->wait_lock);
643 746
@@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
904{ 1007{
905 lock->owner = NULL; 1008 lock->owner = NULL;
906 raw_spin_lock_init(&lock->wait_lock); 1009 raw_spin_lock_init(&lock->wait_lock);
907 plist_head_init(&lock->wait_list); 1010 lock->waiters = RB_ROOT;
1011 lock->waiters_leftmost = NULL;
908 1012
909 debug_rt_mutex_init(lock, name); 1013 debug_rt_mutex_init(lock, name);
910} 1014}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..7431a9c86f35 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
40 * This is the control structure for tasks blocked on a rt_mutex, 40 * This is the control structure for tasks blocked on a rt_mutex,
 41 * which is allocated on the kernel stack of the blocked task. 41 * which is allocated on the kernel stack of the blocked task.
42 * 42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list 43 * @tree_entry: pi node to enqueue into the mutex waiters tree
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list 44 * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
45 * @task: task reference to the blocked task 45 * @task: task reference to the blocked task
46 */ 46 */
47struct rt_mutex_waiter { 47struct rt_mutex_waiter {
48 struct plist_node list_entry; 48 struct rb_node tree_entry;
49 struct plist_node pi_list_entry; 49 struct rb_node pi_tree_entry;
50 struct task_struct *task; 50 struct task_struct *task;
51 struct rt_mutex *lock; 51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES 52#ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -54,14 +54,15 @@ struct rt_mutex_waiter {
54 struct pid *deadlock_task_pid; 54 struct pid *deadlock_task_pid;
55 struct rt_mutex *deadlock_lock; 55 struct rt_mutex *deadlock_lock;
56#endif 56#endif
57 int prio;
57}; 58};
58 59
59/* 60/*
60 * Various helpers to access the waiters-plist: 61 * Various helpers to access the waiters-tree:
61 */ 62 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 63static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{ 64{
64 return !plist_head_empty(&lock->wait_list); 65 return !RB_EMPTY_ROOT(&lock->waiters);
65} 66}
66 67
67static inline struct rt_mutex_waiter * 68static inline struct rt_mutex_waiter *
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
69{ 70{
70 struct rt_mutex_waiter *w; 71 struct rt_mutex_waiter *w;
71 72
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, 73 w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
73 list_entry); 74 tree_entry);
74 BUG_ON(w->lock != lock); 75 BUG_ON(w->lock != lock);
75 76
76 return w; 77 return w;
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
78 79
79static inline int task_has_pi_waiters(struct task_struct *p) 80static inline int task_has_pi_waiters(struct task_struct *p)
80{ 81{
81 return !plist_head_empty(&p->pi_waiters); 82 return !RB_EMPTY_ROOT(&p->pi_waiters);
82} 83}
83 84
84static inline struct rt_mutex_waiter * 85static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p) 86task_top_pi_waiter(struct task_struct *p)
86{ 87{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, 88 return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
88 pi_list_entry); 89 pi_tree_entry);
89} 90}
90 91
91/* 92/*
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15obj-y += wait.o completion.o 16obj-y += wait.o completion.o
16obj-$(CONFIG_SMP) += cpupri.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
18obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
19obj-$(CONFIG_SCHED_DEBUG) += debug.o 20obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..6bd6a6731b21 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current cpu.
31 * 30 *
31 * sched_clock_cpu(i)
32 *
32 * How: 33 * How:
33 * 34 *
34 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 51 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
52 * 53 *
53 *
54 * Notes:
55 *
56 * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */ 54 */
63#include <linux/spinlock.h> 55#include <linux/spinlock.h>
64#include <linux/hardirq.h> 56#include <linux/hardirq.h>
@@ -66,6 +58,8 @@
66#include <linux/percpu.h> 58#include <linux/percpu.h>
67#include <linux/ktime.h> 59#include <linux/ktime.h>
68#include <linux/sched.h> 60#include <linux/sched.h>
61#include <linux/static_key.h>
62#include <linux/workqueue.h>
69 63
70/* 64/*
71 * Scheduler clock - returns current time in nanosec units. 65 * Scheduler clock - returns current time in nanosec units.
@@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock);
82__read_mostly int sched_clock_running; 76__read_mostly int sched_clock_running;
83 77
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 78#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 79static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
80
81int sched_clock_stable(void)
82{
83 if (static_key_false(&__sched_clock_stable))
84 return false;
85 return true;
86}
87
88void set_sched_clock_stable(void)
89{
90 if (!sched_clock_stable())
91 static_key_slow_dec(&__sched_clock_stable);
92}
93
94static void __clear_sched_clock_stable(struct work_struct *work)
95{
96 /* XXX worry about clock continuity */
97 if (sched_clock_stable())
98 static_key_slow_inc(&__sched_clock_stable);
99}
100
101static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
102
103void clear_sched_clock_stable(void)
104{
105 if (keventd_up())
106 schedule_work(&sched_clock_work);
107 else
108 __clear_sched_clock_stable(&sched_clock_work);
109}
86 110
87struct sched_clock_data { 111struct sched_clock_data {
88 u64 tick_raw; 112 u64 tick_raw;
@@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu)
242 struct sched_clock_data *scd; 266 struct sched_clock_data *scd;
243 u64 clock; 267 u64 clock;
244 268
245 WARN_ON_ONCE(!irqs_disabled()); 269 if (sched_clock_stable())
246
247 if (sched_clock_stable)
248 return sched_clock(); 270 return sched_clock();
249 271
250 if (unlikely(!sched_clock_running)) 272 if (unlikely(!sched_clock_running))
251 return 0ull; 273 return 0ull;
252 274
275 preempt_disable();
253 scd = cpu_sdc(cpu); 276 scd = cpu_sdc(cpu);
254 277
255 if (cpu != smp_processor_id()) 278 if (cpu != smp_processor_id())
256 clock = sched_clock_remote(scd); 279 clock = sched_clock_remote(scd);
257 else 280 else
258 clock = sched_clock_local(scd); 281 clock = sched_clock_local(scd);
282 preempt_enable();
259 283
260 return clock; 284 return clock;
261} 285}
@@ -265,7 +289,7 @@ void sched_clock_tick(void)
265 struct sched_clock_data *scd; 289 struct sched_clock_data *scd;
266 u64 now, now_gtod; 290 u64 now, now_gtod;
267 291
268 if (sched_clock_stable) 292 if (sched_clock_stable())
269 return; 293 return;
270 294
271 if (unlikely(!sched_clock_running)) 295 if (unlikely(!sched_clock_running))
@@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
316 */ 340 */
317u64 cpu_clock(int cpu) 341u64 cpu_clock(int cpu)
318{ 342{
319 u64 clock; 343 if (static_key_false(&__sched_clock_stable))
320 unsigned long flags; 344 return sched_clock_cpu(cpu);
321
322 local_irq_save(flags);
323 clock = sched_clock_cpu(cpu);
324 local_irq_restore(flags);
325 345
326 return clock; 346 return sched_clock();
327} 347}
328 348
329/* 349/*
@@ -335,14 +355,10 @@ u64 cpu_clock(int cpu)
335 */ 355 */
336u64 local_clock(void) 356u64 local_clock(void)
337{ 357{
338 u64 clock; 358 if (static_key_false(&__sched_clock_stable))
339 unsigned long flags; 359 return sched_clock_cpu(raw_smp_processor_id());
340 360
341 local_irq_save(flags); 361 return sched_clock();
342 clock = sched_clock_cpu(smp_processor_id());
343 local_irq_restore(flags);
344
345 return clock;
346} 362}
347 363
348#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 364#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu)
362 378
363u64 cpu_clock(int cpu) 379u64 cpu_clock(int cpu)
364{ 380{
365 return sched_clock_cpu(cpu); 381 return sched_clock();
366} 382}
367 383
368u64 local_clock(void) 384u64 local_clock(void)
369{ 385{
370 return sched_clock_cpu(0); 386 return sched_clock();
371} 387}
372 388
373#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 389#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..36c951b7eef8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
296 */ 296 */
297int sysctl_sched_rt_runtime = 950000; 297int sysctl_sched_rt_runtime = 950000;
298 298
299
300
301/* 299/*
302 * __task_rq_lock - lock the rq @p resides on. 300 * __task_rq_lock - lock the rq @p resides on.
303 */ 301 */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
899{ 897{
900 int prio; 898 int prio;
901 899
902 if (task_has_rt_policy(p)) 900 if (task_has_dl_policy(p))
901 prio = MAX_DL_PRIO-1;
902 else if (task_has_rt_policy(p))
903 prio = MAX_RT_PRIO-1 - p->rt_priority; 903 prio = MAX_RT_PRIO-1 - p->rt_priority;
904 else 904 else
905 prio = __normal_prio(p); 905 prio = __normal_prio(p);
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 if (prev_class->switched_from) 945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p); 946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p); 947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio) 948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio); 949 p->sched_class->prio_changed(rq, p, oldprio);
950} 950}
951 951
@@ -1499,8 +1499,7 @@ void scheduler_ipi(void)
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI. 1500 * this IPI.
1501 */ 1501 */
1502 if (tif_need_resched()) 1502 preempt_fold_need_resched();
1503 set_preempt_need_resched();
1504 1503
1505 if (llist_empty(&this_rq()->wake_list) 1504 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id()) 1505 && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1716,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1716 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1717#endif
1719 1718
1719 RB_CLEAR_NODE(&p->dl.rb_node);
1720 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1721 p->dl.dl_runtime = p->dl.runtime = 0;
1722 p->dl.dl_deadline = p->dl.deadline = 0;
1723 p->dl.dl_period = 0;
1724 p->dl.flags = 0;
1725
1720 INIT_LIST_HEAD(&p->rt.run_list); 1726 INIT_LIST_HEAD(&p->rt.run_list);
1721 1727
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1728#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1768,7 +1774,7 @@ void set_numabalancing_state(bool enabled)
1768/* 1774/*
1769 * fork()/clone()-time setup: 1775 * fork()/clone()-time setup:
1770 */ 1776 */
1771void sched_fork(unsigned long clone_flags, struct task_struct *p) 1777int sched_fork(unsigned long clone_flags, struct task_struct *p)
1772{ 1778{
1773 unsigned long flags; 1779 unsigned long flags;
1774 int cpu = get_cpu(); 1780 int cpu = get_cpu();
@@ -1790,7 +1796,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1790 * Revert to default priority/policy on fork if requested. 1796 * Revert to default priority/policy on fork if requested.
1791 */ 1797 */
1792 if (unlikely(p->sched_reset_on_fork)) { 1798 if (unlikely(p->sched_reset_on_fork)) {
1793 if (task_has_rt_policy(p)) { 1799 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1794 p->policy = SCHED_NORMAL; 1800 p->policy = SCHED_NORMAL;
1795 p->static_prio = NICE_TO_PRIO(0); 1801 p->static_prio = NICE_TO_PRIO(0);
1796 p->rt_priority = 0; 1802 p->rt_priority = 0;
@@ -1807,8 +1813,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1807 p->sched_reset_on_fork = 0; 1813 p->sched_reset_on_fork = 0;
1808 } 1814 }
1809 1815
1810 if (!rt_prio(p->prio)) 1816 if (dl_prio(p->prio)) {
1817 put_cpu();
1818 return -EAGAIN;
1819 } else if (rt_prio(p->prio)) {
1820 p->sched_class = &rt_sched_class;
1821 } else {
1811 p->sched_class = &fair_sched_class; 1822 p->sched_class = &fair_sched_class;
1823 }
1812 1824
1813 if (p->sched_class->task_fork) 1825 if (p->sched_class->task_fork)
1814 p->sched_class->task_fork(p); 1826 p->sched_class->task_fork(p);
@@ -1834,11 +1846,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1834 init_task_preempt_count(p); 1846 init_task_preempt_count(p);
1835#ifdef CONFIG_SMP 1847#ifdef CONFIG_SMP
1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1848 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1849 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1837#endif 1850#endif
1838 1851
1839 put_cpu(); 1852 put_cpu();
1853 return 0;
1854}
1855
1856unsigned long to_ratio(u64 period, u64 runtime)
1857{
1858 if (runtime == RUNTIME_INF)
1859 return 1ULL << 20;
1860
1861 /*
1862 * Doing this here saves a lot of checks in all
1863 * the calling paths, and returning zero seems
1864 * safe for them anyway.
1865 */
1866 if (period == 0)
1867 return 0;
1868
1869 return div64_u64(runtime << 20, period);
1870}
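
to_ratio() expresses a bandwidth as a fixed-point fraction with 20 fractional bits. A worked example (not part of the patch):

/*
 *   runtime = 10 ms, period = 100 ms (both in nanoseconds):
 *
 *     to_ratio(100000000, 10000000)
 *       = (10000000 << 20) / 100000000
 *       = 10485760000000 / 100000000
 *       = 104857                          (~ 0.1 * (1 << 20))
 *
 *   i.e. a task using 10% of a CPU accounts for roughly 104857 out of
 *   the full-scale 1 << 20 that RUNTIME_INF maps to.
 */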
1871
1872#ifdef CONFIG_SMP
1873inline struct dl_bw *dl_bw_of(int i)
1874{
1875 return &cpu_rq(i)->rd->dl_bw;
1840} 1876}
1841 1877
1878static inline int dl_bw_cpus(int i)
1879{
1880 struct root_domain *rd = cpu_rq(i)->rd;
1881 int cpus = 0;
1882
1883 for_each_cpu_and(i, rd->span, cpu_active_mask)
1884 cpus++;
1885
1886 return cpus;
1887}
1888#else
1889inline struct dl_bw *dl_bw_of(int i)
1890{
1891 return &cpu_rq(i)->dl.dl_bw;
1892}
1893
1894static inline int dl_bw_cpus(int i)
1895{
1896 return 1;
1897}
1898#endif
1899
1900static inline
1901void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1902{
1903 dl_b->total_bw -= tsk_bw;
1904}
1905
1906static inline
1907void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1908{
1909 dl_b->total_bw += tsk_bw;
1910}
1911
1912static inline
1913bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1914{
1915 return dl_b->bw != -1 &&
1916 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1917}
1918
1919/*
1920 * We must be sure that accepting a new task (or allowing changing the
1921 * parameters of an existing one) is consistent with the bandwidth
 1922 * constraints. If so, this function also updates the currently
 1923 * allocated bandwidth accordingly, to reflect the new situation.
1924 *
1925 * This function is called while holding p's rq->lock.
1926 */
1927static int dl_overflow(struct task_struct *p, int policy,
1928 const struct sched_attr *attr)
1929{
1930
1931 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1932 u64 period = attr->sched_period;
1933 u64 runtime = attr->sched_runtime;
1934 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1935 int cpus, err = -1;
1936
1937 if (new_bw == p->dl.dl_bw)
1938 return 0;
1939
1940 /*
1941 * Either if a task, enters, leave, or stays -deadline but changes
1942 * its parameters, we may need to update accordingly the total
1943 * allocated bandwidth of the container.
1944 */
1945 raw_spin_lock(&dl_b->lock);
1946 cpus = dl_bw_cpus(task_cpu(p));
1947 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1948 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1949 __dl_add(dl_b, new_bw);
1950 err = 0;
1951 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1952 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1953 __dl_clear(dl_b, p->dl.dl_bw);
1954 __dl_add(dl_b, new_bw);
1955 err = 0;
1956 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1957 __dl_clear(dl_b, p->dl.dl_bw);
1958 err = 0;
1959 }
1960 raw_spin_unlock(&dl_b->lock);
1961
1962 return err;
1963}
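
To make the admission test concrete, a worked example assuming the per-CPU cap dl_b->bw has been initialized to the usual 95% default, i.e. to_ratio(1 s, 950 ms) = 996147, on a 4-CPU root domain (the numbers are illustrative, not from the patch):

/*
 *   dl_b->bw       = 996147       (95% of one CPU, in 1 << 20 units)
 *   cpus           = 4            -> capacity = 996147 * 4 = 3984588
 *   dl_b->total_bw = 3000000      (already admitted -deadline tasks)
 *
 *   New task with runtime = 10 ms, period = deadline = 100 ms:
 *     new_bw = to_ratio(100 ms, 10 ms) = 104857
 *
 *   __dl_overflow() checks  3984588 < 3000000 - 0 + 104857  -> false,
 *   so the task is admitted and total_bw becomes 3104857.
 */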
1964
1965extern void init_dl_bw(struct dl_bw *dl_b);
1966
1842/* 1967/*
1843 * wake_up_new_task - wake up a newly created task for the first time. 1968 * wake_up_new_task - wake up a newly created task for the first time.
1844 * 1969 *
@@ -2003,6 +2128,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2003 if (unlikely(prev_state == TASK_DEAD)) { 2128 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev); 2129 task_numa_free(prev);
2005 2130
2131 if (prev->sched_class->task_dead)
2132 prev->sched_class->task_dead(prev);
2133
2006 /* 2134 /*
2007 * Remove function-return probe instances associated with this 2135 * Remove function-return probe instances associated with this
2008 * task and put them back on the free list. 2136 * task and put them back on the free list.
@@ -2296,7 +2424,7 @@ void scheduler_tick(void)
2296 2424
2297#ifdef CONFIG_SMP 2425#ifdef CONFIG_SMP
2298 rq->idle_balance = idle_cpu(cpu); 2426 rq->idle_balance = idle_cpu(cpu);
2299 trigger_load_balance(rq, cpu); 2427 trigger_load_balance(rq);
2300#endif 2428#endif
2301 rq_last_tick_reset(rq); 2429 rq_last_tick_reset(rq);
2302} 2430}
@@ -2414,10 +2542,10 @@ static inline void schedule_debug(struct task_struct *prev)
2414{ 2542{
2415 /* 2543 /*
2416 * Test if we are atomic. Since do_exit() needs to call into 2544 * Test if we are atomic. Since do_exit() needs to call into
2417 * schedule() atomically, we ignore that path for now. 2545 * schedule() atomically, we ignore that path. Otherwise whine
2418 * Otherwise, whine if we are scheduling when we should not be. 2546 * if we are scheduling when we should not.
2419 */ 2547 */
2420 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2548 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2421 __schedule_bug(prev); 2549 __schedule_bug(prev);
2422 rcu_sleep_check(); 2550 rcu_sleep_check();
2423 2551
@@ -2761,11 +2889,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
2761 */ 2889 */
2762void rt_mutex_setprio(struct task_struct *p, int prio) 2890void rt_mutex_setprio(struct task_struct *p, int prio)
2763{ 2891{
2764 int oldprio, on_rq, running; 2892 int oldprio, on_rq, running, enqueue_flag = 0;
2765 struct rq *rq; 2893 struct rq *rq;
2766 const struct sched_class *prev_class; 2894 const struct sched_class *prev_class;
2767 2895
2768 BUG_ON(prio < 0 || prio > MAX_PRIO); 2896 BUG_ON(prio > MAX_PRIO);
2769 2897
2770 rq = __task_rq_lock(p); 2898 rq = __task_rq_lock(p);
2771 2899
@@ -2788,6 +2916,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2788 } 2916 }
2789 2917
2790 trace_sched_pi_setprio(p, prio); 2918 trace_sched_pi_setprio(p, prio);
2919 p->pi_top_task = rt_mutex_get_top_task(p);
2791 oldprio = p->prio; 2920 oldprio = p->prio;
2792 prev_class = p->sched_class; 2921 prev_class = p->sched_class;
2793 on_rq = p->on_rq; 2922 on_rq = p->on_rq;
@@ -2797,23 +2926,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2797 if (running) 2926 if (running)
2798 p->sched_class->put_prev_task(rq, p); 2927 p->sched_class->put_prev_task(rq, p);
2799 2928
2800 if (rt_prio(prio)) 2929 /*
2930 * Boosting conditions are:
2931 * 1. -rt task is running and holds mutex A
2932 * --> -dl task blocks on mutex A
2933 *
2934 * 2. -dl task is running and holds mutex A
2935 * --> -dl task blocks on mutex A and could preempt the
2936 * running task
2937 */
2938 if (dl_prio(prio)) {
2939 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2940 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2941 p->dl.dl_boosted = 1;
2942 p->dl.dl_throttled = 0;
2943 enqueue_flag = ENQUEUE_REPLENISH;
2944 } else
2945 p->dl.dl_boosted = 0;
2946 p->sched_class = &dl_sched_class;
2947 } else if (rt_prio(prio)) {
2948 if (dl_prio(oldprio))
2949 p->dl.dl_boosted = 0;
2950 if (oldprio < prio)
2951 enqueue_flag = ENQUEUE_HEAD;
2801 p->sched_class = &rt_sched_class; 2952 p->sched_class = &rt_sched_class;
2802 else 2953 } else {
2954 if (dl_prio(oldprio))
2955 p->dl.dl_boosted = 0;
2803 p->sched_class = &fair_sched_class; 2956 p->sched_class = &fair_sched_class;
2957 }
2804 2958
2805 p->prio = prio; 2959 p->prio = prio;
2806 2960
2807 if (running) 2961 if (running)
2808 p->sched_class->set_curr_task(rq); 2962 p->sched_class->set_curr_task(rq);
2809 if (on_rq) 2963 if (on_rq)
2810 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2964 enqueue_task(rq, p, enqueue_flag);
2811 2965
2812 check_class_changed(rq, p, prev_class, oldprio); 2966 check_class_changed(rq, p, prev_class, oldprio);
2813out_unlock: 2967out_unlock:
2814 __task_rq_unlock(rq); 2968 __task_rq_unlock(rq);
2815} 2969}
2816#endif 2970#endif
2971
2817void set_user_nice(struct task_struct *p, long nice) 2972void set_user_nice(struct task_struct *p, long nice)
2818{ 2973{
2819 int old_prio, delta, on_rq; 2974 int old_prio, delta, on_rq;
@@ -2831,9 +2986,9 @@ void set_user_nice(struct task_struct *p, long nice)
2831 * The RT priorities are set via sched_setscheduler(), but we still 2986 * The RT priorities are set via sched_setscheduler(), but we still
2832 * allow the 'normal' nice value to be set - but as expected 2987 * allow the 'normal' nice value to be set - but as expected
2833 * it won't have any effect on scheduling until the task is 2988 * it won't have any effect on scheduling until the task is
2834 * SCHED_FIFO/SCHED_RR: 2989 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2835 */ 2990 */
2836 if (task_has_rt_policy(p)) { 2991 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2837 p->static_prio = NICE_TO_PRIO(nice); 2992 p->static_prio = NICE_TO_PRIO(nice);
2838 goto out_unlock; 2993 goto out_unlock;
2839 } 2994 }
@@ -2988,22 +3143,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
2988 return pid ? find_task_by_vpid(pid) : current; 3143 return pid ? find_task_by_vpid(pid) : current;
2989} 3144}
2990 3145
2991/* Actually do priority change: must hold rq lock. */ 3146/*
3147 * This function initializes the sched_dl_entity of a task that is
3148 * becoming SCHED_DEADLINE.
3149 *
3150 * Only the static values are considered here, the actual runtime and the
3151 * absolute deadline will be properly calculated when the task is enqueued
3152 * for the first time with its new policy.
3153 */
2992static void 3154static void
2993__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3155__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3156{
3157 struct sched_dl_entity *dl_se = &p->dl;
3158
3159 init_dl_task_timer(dl_se);
3160 dl_se->dl_runtime = attr->sched_runtime;
3161 dl_se->dl_deadline = attr->sched_deadline;
3162 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3163 dl_se->flags = attr->sched_flags;
3164 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3165 dl_se->dl_throttled = 0;
3166 dl_se->dl_new = 1;
3167}
3168
3169/* Actually do priority change: must hold pi & rq lock. */
3170static void __setscheduler(struct rq *rq, struct task_struct *p,
3171 const struct sched_attr *attr)
2994{ 3172{
3173 int policy = attr->sched_policy;
3174
3175 if (policy == -1) /* setparam */
3176 policy = p->policy;
3177
2995 p->policy = policy; 3178 p->policy = policy;
2996 p->rt_priority = prio; 3179
3180 if (dl_policy(policy))
3181 __setparam_dl(p, attr);
3182 else if (fair_policy(policy))
3183 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3184
3185 /*
3186 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3187 * !rt_policy. Always setting this ensures that things like
3188 * getparam()/getattr() don't report silly values for !rt tasks.
3189 */
3190 p->rt_priority = attr->sched_priority;
3191
2997 p->normal_prio = normal_prio(p); 3192 p->normal_prio = normal_prio(p);
2998 /* we are holding p->pi_lock already */
2999 p->prio = rt_mutex_getprio(p); 3193 p->prio = rt_mutex_getprio(p);
3000 if (rt_prio(p->prio)) 3194
3195 if (dl_prio(p->prio))
3196 p->sched_class = &dl_sched_class;
3197 else if (rt_prio(p->prio))
3001 p->sched_class = &rt_sched_class; 3198 p->sched_class = &rt_sched_class;
3002 else 3199 else
3003 p->sched_class = &fair_sched_class; 3200 p->sched_class = &fair_sched_class;
3201
3004 set_load_weight(p); 3202 set_load_weight(p);
3005} 3203}
3006 3204
3205static void
3206__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3207{
3208 struct sched_dl_entity *dl_se = &p->dl;
3209
3210 attr->sched_priority = p->rt_priority;
3211 attr->sched_runtime = dl_se->dl_runtime;
3212 attr->sched_deadline = dl_se->dl_deadline;
3213 attr->sched_period = dl_se->dl_period;
3214 attr->sched_flags = dl_se->flags;
3215}
3216
3217/*
3218 * This function validates the new parameters of a -deadline task.
3219 * We require the deadline to be non-zero and greater than or equal
3220 * to the runtime, and the period to be either zero or greater than
3221 * or equal to the deadline. Furthermore, we have to be sure that
3222 * user parameters are above the internal resolution (1us); we
3223 * check sched_runtime only since it is always the smaller one.
3224 */
3225static bool
3226__checkparam_dl(const struct sched_attr *attr)
3227{
3228 return attr && attr->sched_deadline != 0 &&
3229 (attr->sched_period == 0 ||
3230 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3231 (s64)(attr->sched_deadline - attr->sched_runtime) >= 0 &&
3232 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3233}
3234
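
Put differently, the predicate accepts a reservation only if 0 < sched_runtime <= sched_deadline <= sched_period (a zero period meaning "period equals deadline") and the runtime is at least 2 << (DL_SCALE - 1) nanoseconds, i.e. roughly 1us if DL_SCALE is 10 as assumed here. A standalone sketch of the same check with illustrative values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE_GUESS 10	/* assumed value of the kernel's DL_SCALE */

/* Mirror of the predicate above, on plain nanosecond values. */
static bool checkparam_dl_sketch(uint64_t runtime, uint64_t deadline,
				 uint64_t period)
{
	return deadline != 0 &&
	       (period == 0 || (int64_t)(period - deadline) >= 0) &&
	       (int64_t)(deadline - runtime) >= 0 &&
	       runtime >= (2ULL << (DL_SCALE_GUESS - 1));	/* >= 1024ns */
}

int main(void)
{
	/* 10ms of runtime, 30ms relative deadline, 100ms period: accepted */
	printf("%d\n", checkparam_dl_sketch(10000000, 30000000, 100000000));
	/* runtime larger than the deadline: rejected */
	printf("%d\n", checkparam_dl_sketch(40000000, 30000000, 100000000));
	return 0;
}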
3007/* 3235/*
3008 * check the target process has a UID that matches the current process's 3236 * check the target process has a UID that matches the current process's
3009 */ 3237 */
@@ -3020,10 +3248,12 @@ static bool check_same_owner(struct task_struct *p)
3020 return match; 3248 return match;
3021} 3249}
3022 3250
3023static int __sched_setscheduler(struct task_struct *p, int policy, 3251static int __sched_setscheduler(struct task_struct *p,
3024 const struct sched_param *param, bool user) 3252 const struct sched_attr *attr,
3253 bool user)
3025{ 3254{
3026 int retval, oldprio, oldpolicy = -1, on_rq, running; 3255 int retval, oldprio, oldpolicy = -1, on_rq, running;
3256 int policy = attr->sched_policy;
3027 unsigned long flags; 3257 unsigned long flags;
3028 const struct sched_class *prev_class; 3258 const struct sched_class *prev_class;
3029 struct rq *rq; 3259 struct rq *rq;
@@ -3037,31 +3267,40 @@ recheck:
3037 reset_on_fork = p->sched_reset_on_fork; 3267 reset_on_fork = p->sched_reset_on_fork;
3038 policy = oldpolicy = p->policy; 3268 policy = oldpolicy = p->policy;
3039 } else { 3269 } else {
3040 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3270 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3041 policy &= ~SCHED_RESET_ON_FORK;
3042 3271
3043 if (policy != SCHED_FIFO && policy != SCHED_RR && 3272 if (policy != SCHED_DEADLINE &&
3273 policy != SCHED_FIFO && policy != SCHED_RR &&
3044 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3274 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3045 policy != SCHED_IDLE) 3275 policy != SCHED_IDLE)
3046 return -EINVAL; 3276 return -EINVAL;
3047 } 3277 }
3048 3278
3279 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3280 return -EINVAL;
3281
3049 /* 3282 /*
3050 * Valid priorities for SCHED_FIFO and SCHED_RR are 3283 * Valid priorities for SCHED_FIFO and SCHED_RR are
3051 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3284 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3052 * SCHED_BATCH and SCHED_IDLE is 0. 3285 * SCHED_BATCH and SCHED_IDLE is 0.
3053 */ 3286 */
3054 if (param->sched_priority < 0 || 3287 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3055 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3288 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3056 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3057 return -EINVAL; 3289 return -EINVAL;
3058 if (rt_policy(policy) != (param->sched_priority != 0)) 3290 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3291 (rt_policy(policy) != (attr->sched_priority != 0)))
3059 return -EINVAL; 3292 return -EINVAL;
3060 3293
3061 /* 3294 /*
3062 * Allow unprivileged RT tasks to decrease priority: 3295 * Allow unprivileged RT tasks to decrease priority:
3063 */ 3296 */
3064 if (user && !capable(CAP_SYS_NICE)) { 3297 if (user && !capable(CAP_SYS_NICE)) {
3298 if (fair_policy(policy)) {
3299 if (attr->sched_nice < TASK_NICE(p) &&
3300 !can_nice(p, attr->sched_nice))
3301 return -EPERM;
3302 }
3303
3065 if (rt_policy(policy)) { 3304 if (rt_policy(policy)) {
3066 unsigned long rlim_rtprio = 3305 unsigned long rlim_rtprio =
3067 task_rlimit(p, RLIMIT_RTPRIO); 3306 task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,8 +3310,8 @@ recheck:
3071 return -EPERM; 3310 return -EPERM;
3072 3311
3073 /* can't increase priority */ 3312 /* can't increase priority */
3074 if (param->sched_priority > p->rt_priority && 3313 if (attr->sched_priority > p->rt_priority &&
3075 param->sched_priority > rlim_rtprio) 3314 attr->sched_priority > rlim_rtprio)
3076 return -EPERM; 3315 return -EPERM;
3077 } 3316 }
3078 3317
@@ -3120,14 +3359,21 @@ recheck:
3120 /* 3359 /*
3121 * If not changing anything there's no need to proceed further: 3360 * If not changing anything there's no need to proceed further:
3122 */ 3361 */
3123 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3362 if (unlikely(policy == p->policy)) {
3124 param->sched_priority == p->rt_priority))) { 3363 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3364 goto change;
3365 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3366 goto change;
3367 if (dl_policy(policy))
3368 goto change;
3369
3125 task_rq_unlock(rq, p, &flags); 3370 task_rq_unlock(rq, p, &flags);
3126 return 0; 3371 return 0;
3127 } 3372 }
3373change:
3128 3374
3129#ifdef CONFIG_RT_GROUP_SCHED
3130 if (user) { 3375 if (user) {
3376#ifdef CONFIG_RT_GROUP_SCHED
3131 /* 3377 /*
3132 * Do not allow realtime tasks into groups that have no runtime 3378 * Do not allow realtime tasks into groups that have no runtime
3133 * assigned. 3379 * assigned.
@@ -3138,8 +3384,24 @@ recheck:
3138 task_rq_unlock(rq, p, &flags); 3384 task_rq_unlock(rq, p, &flags);
3139 return -EPERM; 3385 return -EPERM;
3140 } 3386 }
3141 }
3142#endif 3387#endif
3388#ifdef CONFIG_SMP
3389 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3390 cpumask_t *span = rq->rd->span;
3391
3392 /*
3393 * Don't allow tasks with an affinity mask smaller than
3394 * the entire root_domain to become SCHED_DEADLINE. We
3395 * will also fail if there's no bandwidth available.
3396 */
3397 if (!cpumask_subset(span, &p->cpus_allowed) ||
3398 rq->rd->dl_bw.bw == 0) {
3399 task_rq_unlock(rq, p, &flags);
3400 return -EPERM;
3401 }
3402 }
3403#endif
3404 }
3143 3405
3144 /* recheck policy now with rq lock held */ 3406 /* recheck policy now with rq lock held */
3145 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3407 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3409,17 @@ recheck:
3147 task_rq_unlock(rq, p, &flags); 3409 task_rq_unlock(rq, p, &flags);
3148 goto recheck; 3410 goto recheck;
3149 } 3411 }
3412
3413 /*
3414 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3415 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3416 * is available.
3417 */
3418 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3419 task_rq_unlock(rq, p, &flags);
3420 return -EBUSY;
3421 }
3422
3150 on_rq = p->on_rq; 3423 on_rq = p->on_rq;
3151 running = task_current(rq, p); 3424 running = task_current(rq, p);
3152 if (on_rq) 3425 if (on_rq)
@@ -3158,7 +3431,7 @@ recheck:
3158 3431
3159 oldprio = p->prio; 3432 oldprio = p->prio;
3160 prev_class = p->sched_class; 3433 prev_class = p->sched_class;
3161 __setscheduler(rq, p, policy, param->sched_priority); 3434 __setscheduler(rq, p, attr);
3162 3435
3163 if (running) 3436 if (running)
3164 p->sched_class->set_curr_task(rq); 3437 p->sched_class->set_curr_task(rq);
@@ -3173,6 +3446,26 @@ recheck:
3173 return 0; 3446 return 0;
3174} 3447}
3175 3448
3449static int _sched_setscheduler(struct task_struct *p, int policy,
3450 const struct sched_param *param, bool check)
3451{
3452 struct sched_attr attr = {
3453 .sched_policy = policy,
3454 .sched_priority = param->sched_priority,
3455 .sched_nice = PRIO_TO_NICE(p->static_prio),
3456 };
3457
3458 /*
3459 * Fixup the legacy SCHED_RESET_ON_FORK hack
3460 */
3461 if (policy & SCHED_RESET_ON_FORK) {
3462 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3463 policy &= ~SCHED_RESET_ON_FORK;
3464 attr.sched_policy = policy;
3465 }
3466
3467 return __sched_setscheduler(p, &attr, check);
3468}
3176/** 3469/**
3177 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3470 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3178 * @p: the task in question. 3471 * @p: the task in question.
@@ -3186,10 +3479,16 @@ recheck:
3186int sched_setscheduler(struct task_struct *p, int policy, 3479int sched_setscheduler(struct task_struct *p, int policy,
3187 const struct sched_param *param) 3480 const struct sched_param *param)
3188{ 3481{
3189 return __sched_setscheduler(p, policy, param, true); 3482 return _sched_setscheduler(p, policy, param, true);
3190} 3483}
3191EXPORT_SYMBOL_GPL(sched_setscheduler); 3484EXPORT_SYMBOL_GPL(sched_setscheduler);
3192 3485
3486int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3487{
3488 return __sched_setscheduler(p, attr, true);
3489}
3490EXPORT_SYMBOL_GPL(sched_setattr);
3491
3193/** 3492/**
3194 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3493 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3195 * @p: the task in question. 3494 * @p: the task in question.
@@ -3206,7 +3505,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3206int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3505int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3207 const struct sched_param *param) 3506 const struct sched_param *param)
3208{ 3507{
3209 return __sched_setscheduler(p, policy, param, false); 3508 return _sched_setscheduler(p, policy, param, false);
3210} 3509}
3211 3510
3212static int 3511static int
@@ -3231,6 +3530,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3231 return retval; 3530 return retval;
3232} 3531}
3233 3532
3533/*
3534 * Mimics kernel/events/core.c perf_copy_attr().
3535 */
3536static int sched_copy_attr(struct sched_attr __user *uattr,
3537 struct sched_attr *attr)
3538{
3539 u32 size;
3540 int ret;
3541
3542 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3543 return -EFAULT;
3544
3545 /*
3546 * zero the full structure, so that a short copy leaves the tail zeroed.
3547 */
3548 memset(attr, 0, sizeof(*attr));
3549
3550 ret = get_user(size, &uattr->size);
3551 if (ret)
3552 return ret;
3553
3554 if (size > PAGE_SIZE) /* silly large */
3555 goto err_size;
3556
3557 if (!size) /* abi compat */
3558 size = SCHED_ATTR_SIZE_VER0;
3559
3560 if (size < SCHED_ATTR_SIZE_VER0)
3561 goto err_size;
3562
3563 /*
3564 * If we're handed a bigger struct than we know of,
3565 * ensure all the unknown bits are 0 - i.e. new
3566 * user-space does not rely on any kernel feature
3567 * extensions we don't know about yet.
3568 */
3569 if (size > sizeof(*attr)) {
3570 unsigned char __user *addr;
3571 unsigned char __user *end;
3572 unsigned char val;
3573
3574 addr = (void __user *)uattr + sizeof(*attr);
3575 end = (void __user *)uattr + size;
3576
3577 for (; addr < end; addr++) {
3578 ret = get_user(val, addr);
3579 if (ret)
3580 return ret;
3581 if (val)
3582 goto err_size;
3583 }
3584 size = sizeof(*attr);
3585 }
3586
3587 ret = copy_from_user(attr, uattr, size);
3588 if (ret)
3589 return -EFAULT;
3590
3591 /*
3592 * XXX: do we want to be lenient like existing syscalls; or do we want
3593 * to be strict and return an error on out-of-bounds values?
3594 */
3595 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3596
3597out:
3598 return ret;
3599
3600err_size:
3601 put_user(sizeof(*attr), &uattr->size);
3602 ret = -E2BIG;
3603 goto out;
3604}
3605
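
The size-carrying layout is what makes the ABI extensible: old binaries pass a smaller (or zero) size and get the missing fields zeroed, while binaries built against a newer structure are accepted only if every byte past what this kernel understands is zero. A userspace analogue of that tail check (illustration only; the kernel walks the user tail with get_user() rather than touching it directly):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * A caller may hand us a structure larger than the one we know about
 * only if every byte beyond our known size is zero.
 */
static bool tail_is_zero(const void *buf, size_t known, size_t given)
{
	const unsigned char *p = buf;
	size_t i;

	if (given <= known)
		return true;
	for (i = known; i < given; i++)
		if (p[i] != 0)
			return false;
	return true;
}

int main(void)
{
	unsigned char newer_struct[16] = { 1, 2, 3, 4 };	/* tail is zero */

	printf("%d\n", tail_is_zero(newer_struct, 8, sizeof(newer_struct)));
	newer_struct[12] = 0xff;				/* unknown bit set */
	printf("%d\n", tail_is_zero(newer_struct, 8, sizeof(newer_struct)));
	return 0;
}

When the check fails, the function reports -E2BIG and writes the size it does understand back to uattr->size, so newer userspace can shrink its structure and retry.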
3234/** 3606/**
3235 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3607 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3236 * @pid: the pid in question. 3608 * @pid: the pid in question.
@@ -3262,6 +3634,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3262} 3634}
3263 3635
3264/** 3636/**
3637 * sys_sched_setattr - same as above, but with extended sched_attr
3638 * @pid: the pid in question.
3639 * @uattr: structure containing the extended parameters.
3640 */
3641SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
3642{
3643 struct sched_attr attr;
3644 struct task_struct *p;
3645 int retval;
3646
3647 if (!uattr || pid < 0)
3648 return -EINVAL;
3649
3650 if (sched_copy_attr(uattr, &attr))
3651 return -EFAULT;
3652
3653 rcu_read_lock();
3654 retval = -ESRCH;
3655 p = find_process_by_pid(pid);
3656 if (p != NULL)
3657 retval = sched_setattr(p, &attr);
3658 rcu_read_unlock();
3659
3660 return retval;
3661}
3662
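
From userspace, turning the calling thread into a SCHED_DEADLINE task is a matter of filling the structure and issuing the new syscall. A hedged sketch follows: struct sched_attr_user re-declares the VER0 layout (verify it against the installed uapi headers), SCHED_DEADLINE is assumed to be policy number 6, __NR_sched_setattr must be provided by your headers, and the call uses the two arguments defined in this patch, while later kernels added a trailing flags argument.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Userspace mirror of the VER0 layout; verify against your uapi headers. */
struct sched_attr_user {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL / SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO / SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6		/* assumed policy number */
#endif

int main(void)
{
	struct sched_attr_user attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;		/* 10ms  */
	attr.sched_deadline = 30 * 1000 * 1000;		/* 30ms  */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100ms */

	/* (pid, uattr) as defined above; pid 0 means the calling thread.
	 * Later kernels take a third flags argument, so check your headers. */
	if (syscall(__NR_sched_setattr, 0, &attr) != 0) {
		perror("sched_setattr");
		return 1;
	}
	puts("now running as SCHED_DEADLINE");
	return 0;
}

Running it typically requires sufficient privilege, and the kernel may still return EPERM or EBUSY if the affinity or bandwidth checks above fail.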
3663/**
3265 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3664 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3266 * @pid: the pid in question. 3665 * @pid: the pid in question.
3267 * 3666 *
@@ -3316,6 +3715,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3316 if (retval) 3715 if (retval)
3317 goto out_unlock; 3716 goto out_unlock;
3318 3717
3718 if (task_has_dl_policy(p)) {
3719 retval = -EINVAL;
3720 goto out_unlock;
3721 }
3319 lp.sched_priority = p->rt_priority; 3722 lp.sched_priority = p->rt_priority;
3320 rcu_read_unlock(); 3723 rcu_read_unlock();
3321 3724
@@ -3331,6 +3734,96 @@ out_unlock:
3331 return retval; 3734 return retval;
3332} 3735}
3333 3736
3737static int sched_read_attr(struct sched_attr __user *uattr,
3738 struct sched_attr *attr,
3739 unsigned int usize)
3740{
3741 int ret;
3742
3743 if (!access_ok(VERIFY_WRITE, uattr, usize))
3744 return -EFAULT;
3745
3746 /*
3747 * If we're handed a smaller struct than we know of,
3748 * ensure all the unknown bits are 0 - i.e. old
3749 * user-space does not get incomplete information.
3750 */
3751 if (usize < sizeof(*attr)) {
3752 unsigned char *addr;
3753 unsigned char *end;
3754
3755 addr = (void *)attr + usize;
3756 end = (void *)attr + sizeof(*attr);
3757
3758 for (; addr < end; addr++) {
3759 if (*addr)
3760 goto err_size;
3761 }
3762
3763 attr->size = usize;
3764 }
3765
3766 ret = copy_to_user(uattr, attr, usize);
3767 if (ret)
3768 return -EFAULT;
3769
3770out:
3771 return ret;
3772
3773err_size:
3774 ret = -E2BIG;
3775 goto out;
3776}
3777
3778/**
3779 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3780 * @pid: the pid in question.
3781 * @uattr: structure containing the extended parameters.
3782 * @size: sizeof(attr) for fwd/bwd comp.
3783 */
3784SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3785 unsigned int, size)
3786{
3787 struct sched_attr attr = {
3788 .size = sizeof(struct sched_attr),
3789 };
3790 struct task_struct *p;
3791 int retval;
3792
3793 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3794 size < SCHED_ATTR_SIZE_VER0)
3795 return -EINVAL;
3796
3797 rcu_read_lock();
3798 p = find_process_by_pid(pid);
3799 retval = -ESRCH;
3800 if (!p)
3801 goto out_unlock;
3802
3803 retval = security_task_getscheduler(p);
3804 if (retval)
3805 goto out_unlock;
3806
3807 attr.sched_policy = p->policy;
3808 if (p->sched_reset_on_fork)
3809 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3810 if (task_has_dl_policy(p))
3811 __getparam_dl(p, &attr);
3812 else if (task_has_rt_policy(p))
3813 attr.sched_priority = p->rt_priority;
3814 else
3815 attr.sched_nice = TASK_NICE(p);
3816
3817 rcu_read_unlock();
3818
3819 retval = sched_read_attr(uattr, &attr, size);
3820 return retval;
3821
3822out_unlock:
3823 rcu_read_unlock();
3824 return retval;
3825}
3826
3334long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3827long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3335{ 3828{
3336 cpumask_var_t cpus_allowed, new_mask; 3829 cpumask_var_t cpus_allowed, new_mask;
@@ -3375,8 +3868,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3375 if (retval) 3868 if (retval)
3376 goto out_unlock; 3869 goto out_unlock;
3377 3870
3871
3378 cpuset_cpus_allowed(p, cpus_allowed); 3872 cpuset_cpus_allowed(p, cpus_allowed);
3379 cpumask_and(new_mask, in_mask, cpus_allowed); 3873 cpumask_and(new_mask, in_mask, cpus_allowed);
3874
3875 /*
3876 * Since bandwidth control happens on a root_domain basis,
3877 * if the admission test is enabled, we only admit -deadline
3878 * tasks allowed to run on all the CPUs in the task's
3879 * root_domain.
3880 */
3881#ifdef CONFIG_SMP
3882 if (task_has_dl_policy(p)) {
3883 const struct cpumask *span = task_rq(p)->rd->span;
3884
3885 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3886 retval = -EBUSY;
3887 goto out_unlock;
3888 }
3889 }
3890#endif
3380again: 3891again:
3381 retval = set_cpus_allowed_ptr(p, new_mask); 3892 retval = set_cpus_allowed_ptr(p, new_mask);
3382 3893
@@ -3653,7 +4164,7 @@ again:
3653 } 4164 }
3654 4165
3655 double_rq_lock(rq, p_rq); 4166 double_rq_lock(rq, p_rq);
3656 while (task_rq(p) != p_rq) { 4167 if (task_rq(p) != p_rq) {
3657 double_rq_unlock(rq, p_rq); 4168 double_rq_unlock(rq, p_rq);
3658 goto again; 4169 goto again;
3659 } 4170 }
@@ -3742,6 +4253,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3742 case SCHED_RR: 4253 case SCHED_RR:
3743 ret = MAX_USER_RT_PRIO-1; 4254 ret = MAX_USER_RT_PRIO-1;
3744 break; 4255 break;
4256 case SCHED_DEADLINE:
3745 case SCHED_NORMAL: 4257 case SCHED_NORMAL:
3746 case SCHED_BATCH: 4258 case SCHED_BATCH:
3747 case SCHED_IDLE: 4259 case SCHED_IDLE:
@@ -3768,6 +4280,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3768 case SCHED_RR: 4280 case SCHED_RR:
3769 ret = 1; 4281 ret = 1;
3770 break; 4282 break;
4283 case SCHED_DEADLINE:
3771 case SCHED_NORMAL: 4284 case SCHED_NORMAL:
3772 case SCHED_BATCH: 4285 case SCHED_BATCH:
3773 case SCHED_IDLE: 4286 case SCHED_IDLE:
@@ -4514,13 +5027,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
4514static int sched_cpu_inactive(struct notifier_block *nfb, 5027static int sched_cpu_inactive(struct notifier_block *nfb,
4515 unsigned long action, void *hcpu) 5028 unsigned long action, void *hcpu)
4516{ 5029{
5030 unsigned long flags;
5031 long cpu = (long)hcpu;
5032
4517 switch (action & ~CPU_TASKS_FROZEN) { 5033 switch (action & ~CPU_TASKS_FROZEN) {
4518 case CPU_DOWN_PREPARE: 5034 case CPU_DOWN_PREPARE:
4519 set_cpu_active((long)hcpu, false); 5035 set_cpu_active(cpu, false);
5036
5037 /* explicitly allow suspend */
5038 if (!(action & CPU_TASKS_FROZEN)) {
5039 struct dl_bw *dl_b = dl_bw_of(cpu);
5040 bool overflow;
5041 int cpus;
5042
5043 raw_spin_lock_irqsave(&dl_b->lock, flags);
5044 cpus = dl_bw_cpus(cpu);
5045 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5046 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5047
5048 if (overflow)
5049 return notifier_from_errno(-EBUSY);
5050 }
4520 return NOTIFY_OK; 5051 return NOTIFY_OK;
4521 default:
4522 return NOTIFY_DONE;
4523 } 5052 }
5053
5054 return NOTIFY_DONE;
4524} 5055}
4525 5056
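
The hotplug hook refuses to take a CPU down when the reservations already admitted in its root_domain would no longer fit on the remaining CPUs, which is the __dl_overflow(dl_b, cpus, 0, 0) test with the outgoing CPU excluded. A small sketch of the arithmetic, using the same 2^20 fixed-point convention and made-up utilizations:

#include <stdint.h>
#include <stdio.h>

/* Same 2^20 fixed-point convention as in the earlier bandwidth sketch. */
static uint64_t to_ratio_sketch(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t cap_per_cpu = to_ratio_sketch(1000000, 950000);  /* 95% */
	/* nine reservations already admitted, each worth 40% of a CPU */
	uint64_t total_bw = 9 * to_ratio_sketch(100, 40);

	printf("fits on 4 CPUs: %d\n", cap_per_cpu * 4 >= total_bw);	/* 1 */
	printf("fits on 3 CPUs: %d\n", cap_per_cpu * 3 >= total_bw);	/* 0 */
	return 0;
}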
4526static int __init migration_init(void) 5057static int __init migration_init(void)
@@ -4739,6 +5270,8 @@ static void free_rootdomain(struct rcu_head *rcu)
4739 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5270 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4740 5271
4741 cpupri_cleanup(&rd->cpupri); 5272 cpupri_cleanup(&rd->cpupri);
5273 cpudl_cleanup(&rd->cpudl);
5274 free_cpumask_var(rd->dlo_mask);
4742 free_cpumask_var(rd->rto_mask); 5275 free_cpumask_var(rd->rto_mask);
4743 free_cpumask_var(rd->online); 5276 free_cpumask_var(rd->online);
4744 free_cpumask_var(rd->span); 5277 free_cpumask_var(rd->span);
@@ -4790,8 +5323,14 @@ static int init_rootdomain(struct root_domain *rd)
4790 goto out; 5323 goto out;
4791 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5324 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4792 goto free_span; 5325 goto free_span;
4793 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5326 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
4794 goto free_online; 5327 goto free_online;
5328 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5329 goto free_dlo_mask;
5330
5331 init_dl_bw(&rd->dl_bw);
5332 if (cpudl_init(&rd->cpudl) != 0)
5333 goto free_dlo_mask;
4795 5334
4796 if (cpupri_init(&rd->cpupri) != 0) 5335 if (cpupri_init(&rd->cpupri) != 0)
4797 goto free_rto_mask; 5336 goto free_rto_mask;
@@ -4799,6 +5338,8 @@ static int init_rootdomain(struct root_domain *rd)
4799 5338
4800free_rto_mask: 5339free_rto_mask:
4801 free_cpumask_var(rd->rto_mask); 5340 free_cpumask_var(rd->rto_mask);
5341free_dlo_mask:
5342 free_cpumask_var(rd->dlo_mask);
4802free_online: 5343free_online:
4803 free_cpumask_var(rd->online); 5344 free_cpumask_var(rd->online);
4804free_span: 5345free_span:
@@ -6150,6 +6691,7 @@ void __init sched_init_smp(void)
6150 free_cpumask_var(non_isolated_cpus); 6691 free_cpumask_var(non_isolated_cpus);
6151 6692
6152 init_sched_rt_class(); 6693 init_sched_rt_class();
6694 init_sched_dl_class();
6153} 6695}
6154#else 6696#else
6155void __init sched_init_smp(void) 6697void __init sched_init_smp(void)
@@ -6219,13 +6761,15 @@ void __init sched_init(void)
6219#endif /* CONFIG_CPUMASK_OFFSTACK */ 6761#endif /* CONFIG_CPUMASK_OFFSTACK */
6220 } 6762 }
6221 6763
6764 init_rt_bandwidth(&def_rt_bandwidth,
6765 global_rt_period(), global_rt_runtime());
6766 init_dl_bandwidth(&def_dl_bandwidth,
6767 global_rt_period(), global_rt_runtime());
6768
6222#ifdef CONFIG_SMP 6769#ifdef CONFIG_SMP
6223 init_defrootdomain(); 6770 init_defrootdomain();
6224#endif 6771#endif
6225 6772
6226 init_rt_bandwidth(&def_rt_bandwidth,
6227 global_rt_period(), global_rt_runtime());
6228
6229#ifdef CONFIG_RT_GROUP_SCHED 6773#ifdef CONFIG_RT_GROUP_SCHED
6230 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6774 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6231 global_rt_period(), global_rt_runtime()); 6775 global_rt_period(), global_rt_runtime());
@@ -6249,6 +6793,7 @@ void __init sched_init(void)
6249 rq->calc_load_update = jiffies + LOAD_FREQ; 6793 rq->calc_load_update = jiffies + LOAD_FREQ;
6250 init_cfs_rq(&rq->cfs); 6794 init_cfs_rq(&rq->cfs);
6251 init_rt_rq(&rq->rt, rq); 6795 init_rt_rq(&rq->rt, rq);
6796 init_dl_rq(&rq->dl, rq);
6252#ifdef CONFIG_FAIR_GROUP_SCHED 6797#ifdef CONFIG_FAIR_GROUP_SCHED
6253 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6798 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6254 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6799 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6865,6 @@ void __init sched_init(void)
6320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6865 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6321#endif 6866#endif
6322 6867
6323#ifdef CONFIG_RT_MUTEXES
6324 plist_head_init(&init_task.pi_waiters);
6325#endif
6326
6327 /* 6868 /*
6328 * The boot idle thread does lazy MMU switching as well: 6869 * The boot idle thread does lazy MMU switching as well:
6329 */ 6870 */
@@ -6397,13 +6938,16 @@ EXPORT_SYMBOL(__might_sleep);
6397static void normalize_task(struct rq *rq, struct task_struct *p) 6938static void normalize_task(struct rq *rq, struct task_struct *p)
6398{ 6939{
6399 const struct sched_class *prev_class = p->sched_class; 6940 const struct sched_class *prev_class = p->sched_class;
6941 struct sched_attr attr = {
6942 .sched_policy = SCHED_NORMAL,
6943 };
6400 int old_prio = p->prio; 6944 int old_prio = p->prio;
6401 int on_rq; 6945 int on_rq;
6402 6946
6403 on_rq = p->on_rq; 6947 on_rq = p->on_rq;
6404 if (on_rq) 6948 if (on_rq)
6405 dequeue_task(rq, p, 0); 6949 dequeue_task(rq, p, 0);
6406 __setscheduler(rq, p, SCHED_NORMAL, 0); 6950 __setscheduler(rq, p, &attr);
6407 if (on_rq) { 6951 if (on_rq) {
6408 enqueue_task(rq, p, 0); 6952 enqueue_task(rq, p, 0);
6409 resched_task(rq->curr); 6953 resched_task(rq->curr);
@@ -6433,7 +6977,7 @@ void normalize_rt_tasks(void)
6433 p->se.statistics.block_start = 0; 6977 p->se.statistics.block_start = 0;
6434#endif 6978#endif
6435 6979
6436 if (!rt_task(p)) { 6980 if (!dl_task(p) && !rt_task(p)) {
6437 /* 6981 /*
6438 * Renice negative nice level userspace 6982 * Renice negative nice level userspace
6439 * tasks back to 0: 6983 * tasks back to 0:
@@ -6628,16 +7172,6 @@ void sched_move_task(struct task_struct *tsk)
6628} 7172}
6629#endif /* CONFIG_CGROUP_SCHED */ 7173#endif /* CONFIG_CGROUP_SCHED */
6630 7174
6631#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6632static unsigned long to_ratio(u64 period, u64 runtime)
6633{
6634 if (runtime == RUNTIME_INF)
6635 return 1ULL << 20;
6636
6637 return div64_u64(runtime << 20, period);
6638}
6639#endif
6640
6641#ifdef CONFIG_RT_GROUP_SCHED 7175#ifdef CONFIG_RT_GROUP_SCHED
6642/* 7176/*
6643 * Ensure that the real time constraints are schedulable. 7177 * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7345,13 @@ static long sched_group_rt_period(struct task_group *tg)
6811 do_div(rt_period_us, NSEC_PER_USEC); 7345 do_div(rt_period_us, NSEC_PER_USEC);
6812 return rt_period_us; 7346 return rt_period_us;
6813} 7347}
7348#endif /* CONFIG_RT_GROUP_SCHED */
6814 7349
7350#ifdef CONFIG_RT_GROUP_SCHED
6815static int sched_rt_global_constraints(void) 7351static int sched_rt_global_constraints(void)
6816{ 7352{
6817 u64 runtime, period;
6818 int ret = 0; 7353 int ret = 0;
6819 7354
6820 if (sysctl_sched_rt_period <= 0)
6821 return -EINVAL;
6822
6823 runtime = global_rt_runtime();
6824 period = global_rt_period();
6825
6826 /*
6827 * Sanity check on the sysctl variables.
6828 */
6829 if (runtime > period && runtime != RUNTIME_INF)
6830 return -EINVAL;
6831
6832 mutex_lock(&rt_constraints_mutex); 7355 mutex_lock(&rt_constraints_mutex);
6833 read_lock(&tasklist_lock); 7356 read_lock(&tasklist_lock);
6834 ret = __rt_schedulable(NULL, 0, 0); 7357 ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7374,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6851static int sched_rt_global_constraints(void) 7374static int sched_rt_global_constraints(void)
6852{ 7375{
6853 unsigned long flags; 7376 unsigned long flags;
6854 int i; 7377 int i, ret = 0;
6855
6856 if (sysctl_sched_rt_period <= 0)
6857 return -EINVAL;
6858
6859 /*
6860 * There's always some RT tasks in the root group
6861 * -- migration, kstopmachine etc..
6862 */
6863 if (sysctl_sched_rt_runtime == 0)
6864 return -EBUSY;
6865 7378
6866 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7379 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6867 for_each_possible_cpu(i) { 7380 for_each_possible_cpu(i) {
@@ -6873,36 +7386,88 @@ static int sched_rt_global_constraints(void)
6873 } 7386 }
6874 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7387 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6875 7388
6876 return 0; 7389 return ret;
6877} 7390}
6878#endif /* CONFIG_RT_GROUP_SCHED */ 7391#endif /* CONFIG_RT_GROUP_SCHED */
6879 7392
6880int sched_rr_handler(struct ctl_table *table, int write, 7393static int sched_dl_global_constraints(void)
6881 void __user *buffer, size_t *lenp,
6882 loff_t *ppos)
6883{ 7394{
6884 int ret; 7395 u64 runtime = global_rt_runtime();
6885 static DEFINE_MUTEX(mutex); 7396 u64 period = global_rt_period();
7397 u64 new_bw = to_ratio(period, runtime);
7398 int cpu, ret = 0;
6886 7399
6887 mutex_lock(&mutex); 7400 /*
6888 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7401 * Here we want to check that the bandwidth is not set to a
6889 /* make sure that internally we keep jiffies */ 7402 * value smaller than the currently allocated bandwidth in
6890 /* also, writing zero resets timeslice to default */ 7403 * any of the root_domains.
6891 if (!ret && write) { 7404 *
6892 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7405 * FIXME: Cycling over all the CPUs is overkill, but simpler than
6893 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7406 * cycling on root_domains... Discussion on different/better
7407 * solutions is welcome!
7408 */
7409 for_each_possible_cpu(cpu) {
7410 struct dl_bw *dl_b = dl_bw_of(cpu);
7411
7412 raw_spin_lock(&dl_b->lock);
7413 if (new_bw < dl_b->total_bw)
7414 ret = -EBUSY;
7415 raw_spin_unlock(&dl_b->lock);
7416
7417 if (ret)
7418 break;
6894 } 7419 }
6895 mutex_unlock(&mutex); 7420
6896 return ret; 7421 return ret;
6897} 7422}
6898 7423
7424static void sched_dl_do_global(void)
7425{
7426 u64 new_bw = -1;
7427 int cpu;
7428
7429 def_dl_bandwidth.dl_period = global_rt_period();
7430 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7431
7432 if (global_rt_runtime() != RUNTIME_INF)
7433 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7434
7435 /*
7436 * FIXME: As above...
7437 */
7438 for_each_possible_cpu(cpu) {
7439 struct dl_bw *dl_b = dl_bw_of(cpu);
7440
7441 raw_spin_lock(&dl_b->lock);
7442 dl_b->bw = new_bw;
7443 raw_spin_unlock(&dl_b->lock);
7444 }
7445}
7446
7447static int sched_rt_global_validate(void)
7448{
7449 if (sysctl_sched_rt_period <= 0)
7450 return -EINVAL;
7451
7452 if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
7453 return -EINVAL;
7454
7455 return 0;
7456}
7457
7458static void sched_rt_do_global(void)
7459{
7460 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7461 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7462}
7463
6899int sched_rt_handler(struct ctl_table *table, int write, 7464int sched_rt_handler(struct ctl_table *table, int write,
6900 void __user *buffer, size_t *lenp, 7465 void __user *buffer, size_t *lenp,
6901 loff_t *ppos) 7466 loff_t *ppos)
6902{ 7467{
6903 int ret;
6904 int old_period, old_runtime; 7468 int old_period, old_runtime;
6905 static DEFINE_MUTEX(mutex); 7469 static DEFINE_MUTEX(mutex);
7470 int ret;
6906 7471
6907 mutex_lock(&mutex); 7472 mutex_lock(&mutex);
6908 old_period = sysctl_sched_rt_period; 7473 old_period = sysctl_sched_rt_period;
@@ -6911,21 +7476,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7476 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912 7477
6913 if (!ret && write) { 7478 if (!ret && write) {
7479 ret = sched_rt_global_validate();
7480 if (ret)
7481 goto undo;
7482
6914 ret = sched_rt_global_constraints(); 7483 ret = sched_rt_global_constraints();
6915 if (ret) { 7484 if (ret)
6916 sysctl_sched_rt_period = old_period; 7485 goto undo;
6917 sysctl_sched_rt_runtime = old_runtime; 7486
6918 } else { 7487 ret = sched_dl_global_constraints();
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7488 if (ret)
6920 def_rt_bandwidth.rt_period = 7489 goto undo;
6921 ns_to_ktime(global_rt_period()); 7490
6922 } 7491 sched_rt_do_global();
7492 sched_dl_do_global();
7493 }
7494 if (0) {
7495undo:
7496 sysctl_sched_rt_period = old_period;
7497 sysctl_sched_rt_runtime = old_runtime;
6923 } 7498 }
6924 mutex_unlock(&mutex); 7499 mutex_unlock(&mutex);
6925 7500
6926 return ret; 7501 return ret;
6927} 7502}
6928 7503
7504int sched_rr_handler(struct ctl_table *table, int write,
7505 void __user *buffer, size_t *lenp,
7506 loff_t *ppos)
7507{
7508 int ret;
7509 static DEFINE_MUTEX(mutex);
7510
7511 mutex_lock(&mutex);
7512 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7513 /* make sure that internally we keep jiffies */
7514 /* also, writing zero resets timeslice to default */
7515 if (!ret && write) {
7516 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7517 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7518 }
7519 mutex_unlock(&mutex);
7520 return ret;
7521}
7522
6929#ifdef CONFIG_CGROUP_SCHED 7523#ifdef CONFIG_CGROUP_SCHED
6930 7524
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7525static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..045fc74e3f09
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
1/*
2 * kernel/sched/cpudl.c
3 *
4 * Global CPU deadline management
5 *
6 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */
13
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include "cpudeadline.h"
17
18static inline int parent(int i)
19{
20 return (i - 1) >> 1;
21}
22
23static inline int left_child(int i)
24{
25 return (i << 1) + 1;
26}
27
28static inline int right_child(int i)
29{
30 return (i << 1) + 2;
31}
32
33static inline int dl_time_before(u64 a, u64 b)
34{
35 return (s64)(a - b) < 0;
36}
37
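
dl_time_before() compares two u64 clock values through a signed difference, so the ordering stays correct across wraparound, in the same spirit as the jiffies time_before() helpers. A standalone illustration of why the cast matters:

#include <stdint.h>
#include <stdio.h>

static int dl_time_before_sketch(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t before_wrap = UINT64_MAX - 100;	/* just before wrap */
	uint64_t after_wrap  = 50;			/* just after wrap  */

	/* A plain 'a < b' gets this wrong; the signed delta does not. */
	printf("naive:  %d\n", before_wrap < after_wrap);			/* 0 */
	printf("signed: %d\n", dl_time_before_sketch(before_wrap, after_wrap));	/* 1 */
	return 0;
}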
38static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41
42 swap(cp->elements[a], cp->elements[b]);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
44}
45
46static void cpudl_heapify(struct cpudl *cp, int idx)
47{
48 int l, r, largest;
49
50 /* adapted from lib/prio_heap.c */
51 while (1) {
52 l = left_child(idx);
53 r = right_child(idx);
54 largest = idx;
55
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl))
58 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl))
61 largest = r;
62 if (largest == idx)
63 break;
64
65 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx);
67 idx = largest;
68 }
69}
70
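
The heap is the usual implicit binary heap in an array: slot 0 holds the CPU whose earliest-deadline task has the latest deadline, i.e. the most attractive target for pushing a -deadline task, and parent/child slots are found by index arithmetic alone. A tiny standalone illustration of those relations (deadline values are made up):

#include <stdio.h>

/* Index relations of the implicit binary heap used above. */
static int parent_idx(int i)     { return (i - 1) >> 1; }
static int left_child_idx(int i) { return (i << 1) + 1; }

int main(void)
{
	/* deadlines indexed by heap slot; slot 0 is the max (latest) */
	unsigned long long dl[] = { 900, 750, 860, 300, 740 };
	int i;

	for (i = 1; i < 5; i++)
		printf("slot %d (dl=%llu) has parent slot %d (dl=%llu)\n",
		       i, dl[i], parent_idx(i), dl[parent_idx(i)]);
	printf("children of the root are slots %d and %d\n",
	       left_child_idx(0), left_child_idx(0) + 1);
	return 0;
}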
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{
73 WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx);
78 } else {
79 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx);
84 }
85 }
86}
87
88static inline int cpudl_maximum(struct cpudl *cp)
89{
90 return cp->elements[0].cpu;
91}
92
93/*
94 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context
96 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 *
99 * Returns: int - best CPU (heap maximum if suitable)
100 */
101int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask)
103{
104 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl;
106
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask);
111 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp);
115 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask);
117 }
118
119out:
120 WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
121
122 return best_cpu;
123}
124
125/*
126 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu
130 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked
132 *
133 * Returns: (void)
134 */
135void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136{
137 int old_idx, new_cpu;
138 unsigned long flags;
139
140 WARN_ON(cpu > num_present_cpus());
141
142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu];
144 if (!is_valid) {
145 /* remove item */
146 if (old_idx == IDX_INVALID) {
147 /*
148 * Nothing to remove if old_idx was invalid.
149 * This could happen if rq_offline_dl() is
150 * called for a CPU without -dl tasks running.
151 */
152 goto out;
153 }
154 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx);
165 }
166 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx);
168
169 goto out;
170 }
171
172 if (old_idx == IDX_INVALID) {
173 cp->size++;
174 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else {
180 cpudl_change_key(cp, old_idx, dl);
181 }
182
183out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags);
185}
186
187/*
188 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context
190 */
191int cpudl_init(struct cpudl *cp)
192{
193 int i;
194
195 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock);
197 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++)
199 cp->cpu_to_idx[i] = IDX_INVALID;
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
201 return -ENOMEM;
202 cpumask_setall(cp->free_cpus);
203
204 return 0;
205}
206
207/*
208 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context
210 */
211void cpudl_cleanup(struct cpudl *cp)
212{
213 /*
214 * nothing to do for the moment
215 */
216}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_CPUDL_H
2#define _LINUX_CPUDL_H
3
4#include <linux/sched.h>
5
6#define IDX_INVALID -1
7
8struct array_item {
9 u64 dl;
10 int cpu;
11};
12
13struct cpudl {
14 raw_spinlock_t lock;
15 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus;
19};
20
21
22#ifdef CONFIG_SMP
23int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl, is_valid) do { } while (0)
30#define cpudl_init(cp) (0)
31#endif /* CONFIG_SMP */
32
33#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..0de248202879
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1640 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic, or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Juri Lelli <juri.lelli@gmail.com>,
14 * Michael Trimarchi <michael@amarulasolutions.com>,
15 * Fabio Checconi <fchecconi@gmail.com>
16 */
17#include "sched.h"
18
19#include <linux/slab.h>
20
21struct dl_bandwidth def_dl_bandwidth;
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
54{
55 raw_spin_lock_init(&dl_b->dl_runtime_lock);
56 dl_b->dl_period = period;
57 dl_b->dl_runtime = runtime;
58}
59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b)
63{
64 raw_spin_lock_init(&dl_b->lock);
65 raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
66 if (global_rt_runtime() == RUNTIME_INF)
67 dl_b->bw = -1;
68 else
69 dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
70 raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
71 dl_b->total_bw = 0;
72}
73
74void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
75{
76 dl_rq->rb_root = RB_ROOT;
77
78#ifdef CONFIG_SMP
79 /* zero means no -deadline tasks */
80 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
81
82 dl_rq->dl_nr_migratory = 0;
83 dl_rq->overloaded = 0;
84 dl_rq->pushable_dl_tasks_root = RB_ROOT;
85#else
86 init_dl_bw(&dl_rq->dl_bw);
87#endif
88}
89
90#ifdef CONFIG_SMP
91
92static inline int dl_overloaded(struct rq *rq)
93{
94 return atomic_read(&rq->rd->dlo_count);
95}
96
97static inline void dl_set_overload(struct rq *rq)
98{
99 if (!rq->online)
100 return;
101
102 cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
103 /*
104 * Must be visible before the overload count is
105 * set (as in sched_rt.c).
106 *
107 * Matched by the barrier in pull_dl_task().
108 */
109 smp_wmb();
110 atomic_inc(&rq->rd->dlo_count);
111}
112
113static inline void dl_clear_overload(struct rq *rq)
114{
115 if (!rq->online)
116 return;
117
118 atomic_dec(&rq->rd->dlo_count);
119 cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
120}
121
122static void update_dl_migration(struct dl_rq *dl_rq)
123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1;
128 }
129 } else if (dl_rq->overloaded) {
130 dl_clear_overload(rq_of_dl_rq(dl_rq));
131 dl_rq->overloaded = 0;
132 }
133}
134
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{
137 struct task_struct *p = dl_task_of(dl_se);
138 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
139
140 dl_rq->dl_nr_total++;
141 if (p->nr_cpus_allowed > 1)
142 dl_rq->dl_nr_migratory++;
143
144 update_dl_migration(dl_rq);
145}
146
147static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
148{
149 struct task_struct *p = dl_task_of(dl_se);
150 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
151
152 dl_rq->dl_nr_total--;
153 if (p->nr_cpus_allowed > 1)
154 dl_rq->dl_nr_migratory--;
155
156 update_dl_migration(dl_rq);
157}
158
159/*
160 * Unlike in sched_rt.c, the list of pushable -deadline tasks is not a
161 * plist but an rb-tree with tasks ordered by deadline.
162 */
163static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
164{
165 struct dl_rq *dl_rq = &rq->dl;
166 struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
167 struct rb_node *parent = NULL;
168 struct task_struct *entry;
169 int leftmost = 1;
170
171 BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
172
173 while (*link) {
174 parent = *link;
175 entry = rb_entry(parent, struct task_struct,
176 pushable_dl_tasks);
177 if (dl_entity_preempt(&p->dl, &entry->dl))
178 link = &parent->rb_left;
179 else {
180 link = &parent->rb_right;
181 leftmost = 0;
182 }
183 }
184
185 if (leftmost)
186 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
187
188 rb_link_node(&p->pushable_dl_tasks, parent, link);
189 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
190}
191
192static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
193{
194 struct dl_rq *dl_rq = &rq->dl;
195
196 if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
197 return;
198
199 if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
200 struct rb_node *next_node;
201
202 next_node = rb_next(&p->pushable_dl_tasks);
203 dl_rq->pushable_dl_tasks_leftmost = next_node;
204 }
205
206 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
207 RB_CLEAR_NODE(&p->pushable_dl_tasks);
208}
209
210static inline int has_pushable_dl_tasks(struct rq *rq)
211{
212 return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
213}
214
215static int push_dl_task(struct rq *rq);
216
217#else
218
219static inline
220void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
221{
222}
223
224static inline
225void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
226{
227}
228
229static inline
230void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
231{
232}
233
234static inline
235void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
236{
237}
238
239#endif /* CONFIG_SMP */
240
241static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
242static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
243static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
244 int flags);
245
246/*
247 * We are being explicitly informed that a new instance is starting,
248 * and this means that:
249 * - the absolute deadline of the entity has to be placed at
250 * current time + relative deadline;
251 * - the runtime of the entity has to be set to the maximum value.
252 *
253 * The capability of specifying such an event is useful whenever a -deadline
254 * entity wants to (try to!) synchronize its behaviour with that of the
255 * scheduler, and to (try to!) reconcile itself with its own scheduling
256 * parameters.
257 */
258static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
259 struct sched_dl_entity *pi_se)
260{
261 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
262 struct rq *rq = rq_of_dl_rq(dl_rq);
263
264 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
265
266 /*
267 * We use the regular wall clock time to set deadlines in the
268 * future; in fact, we must consider execution overheads (time
269 * spent in hardirq context, etc.).
270 */
271 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
272 dl_se->runtime = pi_se->dl_runtime;
273 dl_se->dl_new = 0;
274}
275
276/*
277 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
278 * possibility of an entity lasting more than what it declared, and thus
279 * exhausting its runtime.
280 *
281 * Here we are interested in making runtime overrun possible, but we do
282 * not want an entity which is misbehaving to affect the scheduling of all
283 * other entities.
284 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
285 * is used, in order to confine each entity within its own bandwidth.
286 *
287 * This function deals exactly with that, and ensures that when the runtime
288 * of an entity is replenished, its deadline is also postponed. That ensures
289 * the overrunning entity can't interfere with other entities in the system and
290 * can't make them miss their deadlines. Reasons why this kind of overrun
291 * could happen are, typically, an entity voluntarily trying to exceed its
292 * runtime, or having underestimated it when setting its parameters.
293 */
294static void replenish_dl_entity(struct sched_dl_entity *dl_se,
295 struct sched_dl_entity *pi_se)
296{
297 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
298 struct rq *rq = rq_of_dl_rq(dl_rq);
299
300 BUG_ON(pi_se->dl_runtime <= 0);
301
302 /*
303 * This could be the case for a !-dl task that is boosted.
304 * Just go with full inherited parameters.
305 */
306 if (dl_se->dl_deadline == 0) {
307 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
308 dl_se->runtime = pi_se->dl_runtime;
309 }
310
311 /*
312 * We keep moving the deadline away until we get some
313 * available runtime for the entity. This ensures correct
314 * handling of situations where the runtime overrun is
315 * arbitrarily large.
316 */
317 while (dl_se->runtime <= 0) {
318 dl_se->deadline += pi_se->dl_period;
319 dl_se->runtime += pi_se->dl_runtime;
320 }
321
322 /*
323 * At this point, the deadline really should be "in
324 * the future" with respect to rq->clock. If it's
325 * not, we are, for some reason, lagging too much!
326 * Anyway, after having warned userspace about that,
327 * we still try to keep things running by
328 * resetting the deadline and the budget of the
329 * entity.
330 */
331 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
332 static bool lag_once = false;
333
334 if (!lag_once) {
335 lag_once = true;
336 printk_sched("sched: DL replenish lagged too much\n");
337 }
338 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
339 dl_se->runtime = pi_se->dl_runtime;
340 }
341}
342
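
Concretely, the loop above postpones the deadline one period at a time while topping the budget up, so a reservation of 10ms every 100ms that wakes up 3ms in the red ends up with 7ms of runtime and a deadline 100ms further out. A standalone sketch of just that loop:

#include <stdint.h>
#include <stdio.h>

struct dl_params { int64_t dl_runtime; int64_t dl_period; };

/* CBS top-up: postpone the deadline while refilling the budget. */
static void replenish_sketch(const struct dl_params *p,
			     int64_t *runtime, uint64_t *deadline)
{
	while (*runtime <= 0) {
		*deadline += p->dl_period;
		*runtime += p->dl_runtime;
	}
}

int main(void)
{
	struct dl_params p = { .dl_runtime = 10000000, .dl_period = 100000000 };
	int64_t runtime = -3000000;		/* overran by 3ms */
	uint64_t deadline = 500000000;		/* some absolute deadline */

	replenish_sketch(&p, &runtime, &deadline);
	printf("runtime=%lld deadline=%llu\n",
	       (long long)runtime, (unsigned long long)deadline);
	/* prints runtime=7000000 deadline=600000000 */
	return 0;
}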
343/*
344 * Here we check if --at time t-- an entity (which is probably being
345 * [re]activated or, in general, enqueued) can use its remaining runtime
346 * and its current deadline _without_ exceeding the bandwidth it is
347 * assigned (function returns true if it can't). We are in fact applying
348 * one of the CBS rules: when a task wakes up, if the residual runtime
349 * over residual deadline fits within the allocated bandwidth, then we
350 * can keep the current (absolute) deadline and residual budget without
351 * disrupting the schedulability of the system. Otherwise, we should
352 * refill the runtime and set the deadline a period in the future,
353 * because keeping the current (absolute) deadline of the task would
354 * result in breaking guarantees promised to other tasks.
355 *
356 * This function returns true if:
357 *
358 * runtime / (deadline - t) > dl_runtime / dl_period ,
359 *
360 * IOW we can't recycle current parameters.
361 *
362 * Notice that the bandwidth check is done against the period. For
363 * tasks with deadline equal to period this is the same as using
364 * dl_deadline instead of dl_period in the equation above.
365 */
366static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
367 struct sched_dl_entity *pi_se, u64 t)
368{
369 u64 left, right;
370
371 /*
372 * left and right are the two sides of the equation above,
373 * after a bit of shuffling to use multiplications instead
374 * of divisions.
375 *
376 * Note that none of the time values involved in the two
377 * multiplications are absolute: dl_deadline and dl_runtime
378 * are the relative deadline and the maximum runtime of each
379 * instance, runtime is the runtime left for the last instance
380 * and (deadline - t), since t is rq->clock, is the time left
381 * to the (absolute) deadline. Even if overflowing the u64 type
382 * is very unlikely to occur in both cases, here we scale down
383 * as we want to avoid that risk at all. Scaling down by 10
384 * means that we reduce granularity to 1us. We are fine with it,
385 * since this is only a true/false check and, anyway, thinking
386 * of anything below microsecond resolution is actually fiction
387 * (but still we want to give the user that illusion >;).
388 */
389 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
390 right = ((dl_se->deadline - t) >> DL_SCALE) *
391 (pi_se->dl_runtime >> DL_SCALE);
392
393 return dl_time_before(right, left);
394}
395
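
A worked instance of the cross-multiplied test, shifting by DL_SCALE (assumed to be 10, i.e. about 1us of granularity) just as above: a task waking with 4ms of leftover runtime and 20ms to its old deadline, out of a 10ms/100ms reservation, would run at 20% utilization against a 10% reservation, so the function reports an overflow and the caller refreshes deadline and runtime.

#include <stdint.h>
#include <stdio.h>

#define DL_SCALE_GUESS 10	/* assumed value of the kernel's DL_SCALE */

/* true if reusing (runtime, deadline) at time t would exceed the bandwidth */
static int dl_entity_overflow_sketch(uint64_t runtime, uint64_t deadline,
				     uint64_t t, uint64_t dl_runtime,
				     uint64_t dl_period)
{
	uint64_t left, right;

	left  = (dl_period >> DL_SCALE_GUESS) * (runtime >> DL_SCALE_GUESS);
	right = ((deadline - t) >> DL_SCALE_GUESS) *
		(dl_runtime >> DL_SCALE_GUESS);

	return (int64_t)(right - left) < 0;	/* dl_time_before(right, left) */
}

int main(void)
{
	/* 4ms left, 20ms to the deadline, reservation of 10ms every 100ms */
	printf("%d\n", dl_entity_overflow_sketch(4000000, 520000000,
						 500000000, 10000000,
						 100000000));	/* 1 */
	return 0;
}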
396/*
397 * When a -deadline entity is queued back on the runqueue, its runtime and
398 * deadline might need updating.
399 *
400 * The policy here is that we update the deadline of the entity only if:
401 * - the current deadline is in the past,
402 * - using the remaining runtime with the current deadline would make
403 * the entity exceed its bandwidth.
404 */
405static void update_dl_entity(struct sched_dl_entity *dl_se,
406 struct sched_dl_entity *pi_se)
407{
408 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
409 struct rq *rq = rq_of_dl_rq(dl_rq);
410
411 /*
412 * The arrival of a new instance needs special treatment, i.e.,
413 * the actual scheduling parameters have to be "renewed".
414 */
415 if (dl_se->dl_new) {
416 setup_new_dl_entity(dl_se, pi_se);
417 return;
418 }
419
420 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
421 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
422 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
423 dl_se->runtime = pi_se->dl_runtime;
424 }
425}
426
427/*
428 * If the entity depleted all its runtime, and if we want it to sleep
429 * while waiting for some new execution time to become available, we
430 * set the bandwidth enforcement timer to the replenishment instant
431 * and try to activate it.
432 *
433 * Notice that it is important for the caller to know if the timer
434 * actually started or not (i.e., the replenishment instant is in
435 * the future or in the past).
436 */
437static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
438{
439 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
440 struct rq *rq = rq_of_dl_rq(dl_rq);
441 ktime_t now, act;
442 ktime_t soft, hard;
443 unsigned long range;
444 s64 delta;
445
446 if (boosted)
447 return 0;
448 /*
449 * We want the timer to fire at the deadline, but considering
450 * that it is actually coming from rq->clock and not from
451 * hrtimer's time base reading.
452 */
453 act = ns_to_ktime(dl_se->deadline);
454 now = hrtimer_cb_get_time(&dl_se->dl_timer);
455 delta = ktime_to_ns(now) - rq_clock(rq);
456 act = ktime_add_ns(act, delta);
457
458 /*
459 * If the expiry time already passed, e.g., because the value
460 * chosen as the deadline is too small, don't even try to
461 * start the timer in the past!
462 */
463 if (ktime_us_delta(act, now) < 0)
464 return 0;
465
466 hrtimer_set_expires(&dl_se->dl_timer, act);
467
468 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
469 hard = hrtimer_get_expires(&dl_se->dl_timer);
470 range = ktime_to_ns(ktime_sub(hard, soft));
471 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
472 range, HRTIMER_MODE_ABS, 0);
473
474 return hrtimer_active(&dl_se->dl_timer);
475}
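
The clock-base translation in start_dl_timer() amounts to shifting an rq-clock timestamp into the hrtimer base. A hedged sketch, where rq_clock and hrtimer_now are hypothetical nanosecond readings of the two clocks and the boolean mirrors the "expiry already in the past" early return:

#include <stdint.h>
#include <stdbool.h>

/*
 * The deadline is an rq-clock value, while the timer is armed against
 * the hrtimer base; adding the current offset between the two clocks
 * moves the expiry into hrtimer time. Returns false when the expiry
 * would already be in the past, in which case no timer is started.
 */
static bool dl_timer_expiry(uint64_t deadline_rq, uint64_t rq_clock,
			    uint64_t hrtimer_now, uint64_t *expiry)
{
	int64_t delta = (int64_t)(hrtimer_now - rq_clock);

	*expiry = deadline_rq + (uint64_t)delta;

	return (int64_t)(*expiry - hrtimer_now) >= 0;
}
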
476
477/*
478 * This is the bandwidth enforcement timer callback. If here, we know
479 * a task is not on its dl_rq, since the fact that the timer was running
480 * means the task is throttled and needs a runtime replenishment.
481 *
482 * However, what we actually do depends on whether the task is active
483 * (i.e., it is on its rq) or has been removed from there by a call to
484 * dequeue_task_dl(). In the former case we must issue the runtime
485 * replenishment and add the task back to the dl_rq; in the latter, we just
486 * do nothing but clear dl_throttled, so that runtime and deadline
487 * updating (and the queueing back to dl_rq) will be done by the
488 * next call to enqueue_task_dl().
489 */
490static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
491{
492 struct sched_dl_entity *dl_se = container_of(timer,
493 struct sched_dl_entity,
494 dl_timer);
495 struct task_struct *p = dl_task_of(dl_se);
496 struct rq *rq = task_rq(p);
497 raw_spin_lock(&rq->lock);
498
499 /*
500 * We need to take care of possible races here. In fact, the
501 * task might have changed its scheduling policy to something
502 * different from SCHED_DEADLINE or changed its reservation
503 * parameters (through sched_setscheduler()).
504 */
505 if (!dl_task(p) || dl_se->dl_new)
506 goto unlock;
507
508 sched_clock_tick();
509 update_rq_clock(rq);
510 dl_se->dl_throttled = 0;
511 if (p->on_rq) {
512 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
513 if (task_has_dl_policy(rq->curr))
514 check_preempt_curr_dl(rq, p, 0);
515 else
516 resched_task(rq->curr);
517#ifdef CONFIG_SMP
518 /*
519 * Queueing this task back might have overloaded rq,
520 * check if we need to kick someone away.
521 */
522 if (has_pushable_dl_tasks(rq))
523 push_dl_task(rq);
524#endif
525 }
526unlock:
527 raw_spin_unlock(&rq->lock);
528
529 return HRTIMER_NORESTART;
530}
531
532void init_dl_task_timer(struct sched_dl_entity *dl_se)
533{
534 struct hrtimer *timer = &dl_se->dl_timer;
535
536 if (hrtimer_active(timer)) {
537 hrtimer_try_to_cancel(timer);
538 return;
539 }
540
541 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
542 timer->function = dl_task_timer;
543}
544
545static
546int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
547{
548 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
549 int rorun = dl_se->runtime <= 0;
550
551 if (!rorun && !dmiss)
552 return 0;
553
554 /*
555 * If we are beyond our current deadline and we are still
556 * executing, then we have already used some of the runtime of
557 * the next instance. Thus, if we do not account that, we are
558 * stealing bandwidth from the system at each deadline miss!
559 */
560 if (dmiss) {
561 dl_se->runtime = rorun ? dl_se->runtime : 0;
562 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
563 }
564
565 return 1;
566}
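
The miss-accounting rule condenses into a small helper. This is a user-space model of the logic above, with the runtime passed by pointer rather than living in a sched_dl_entity:

#include <stdint.h>
#include <stdbool.h>

/*
 * A deadline miss while still running means the overrun was borrowed
 * from the next instance, so it is charged against the (first clamped)
 * runtime before the replenishment takes place.
 */
static bool runtime_exceeded(int64_t *runtime, uint64_t deadline, uint64_t now)
{
	bool dmiss = (int64_t)(deadline - now) < 0;
	bool rorun = *runtime <= 0;

	if (!dmiss && !rorun)
		return false;

	if (dmiss) {
		if (!rorun)
			*runtime = 0;
		*runtime -= (int64_t)(now - deadline);
	}

	return true;
}
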
567
568/*
569 * Update the current task's runtime statistics (provided it is still
570 * a -deadline task and has not been removed from the dl_rq).
571 */
572static void update_curr_dl(struct rq *rq)
573{
574 struct task_struct *curr = rq->curr;
575 struct sched_dl_entity *dl_se = &curr->dl;
576 u64 delta_exec;
577
578 if (!dl_task(curr) || !on_dl_rq(dl_se))
579 return;
580
581 /*
582 * Consumed budget is computed considering the time as
583 * observed by schedulable tasks (excluding time spent
584 * in hardirq context, etc.). Deadlines are instead
585 * computed using hard walltime. This seems to be the more
586 * natural solution, but the full ramifications of this
587 * approach need further study.
588 */
589 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
590 if (unlikely((s64)delta_exec < 0))
591 delta_exec = 0;
592
593 schedstat_set(curr->se.statistics.exec_max,
594 max(curr->se.statistics.exec_max, delta_exec));
595
596 curr->se.sum_exec_runtime += delta_exec;
597 account_group_exec_runtime(curr, delta_exec);
598
599 curr->se.exec_start = rq_clock_task(rq);
600 cpuacct_charge(curr, delta_exec);
601
602 sched_rt_avg_update(rq, delta_exec);
603
604 dl_se->runtime -= delta_exec;
605 if (dl_runtime_exceeded(rq, dl_se)) {
606 __dequeue_task_dl(rq, curr, 0);
607 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
608 dl_se->dl_throttled = 1;
609 else
610 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
611
612 if (!is_leftmost(curr, &rq->dl))
613 resched_task(curr);
614 }
615
616 /*
617 * Because -- for now -- we share the rt bandwidth, we need to
618 * account our runtime there too, otherwise actual rt tasks
619 * would be able to exceed the shared quota.
620 *
621 * Account to the root rt group for now.
622 *
623 * The solution we're working towards is having the RT groups scheduled
624 * using deadline servers -- however there are a few nasties to figure
625 * out before that can happen.
626 */
627 if (rt_bandwidth_enabled()) {
628 struct rt_rq *rt_rq = &rq->rt;
629
630 raw_spin_lock(&rt_rq->rt_runtime_lock);
631 rt_rq->rt_time += delta_exec;
632 /*
633 * We'll let actual RT tasks worry about the overflow here, we
634 * have our own CBS to keep us in line -- see above.
635 */
636 raw_spin_unlock(&rt_rq->rt_runtime_lock);
637 }
638}
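
A reduced model of the budget charging done above (the deadline-miss handling and the shared rt-bandwidth accounting are left out); exec_start and now_task_clock stand in for se.exec_start and rq_clock_task():

#include <stdint.h>
#include <stdbool.h>

/*
 * Charge the time observed by schedulable tasks against the remaining
 * runtime and report whether the entity now has to be throttled until
 * its replenishment timer fires.
 */
static bool charge_budget(int64_t *runtime, uint64_t *exec_start,
			  uint64_t now_task_clock)
{
	int64_t delta = (int64_t)(now_task_clock - *exec_start);

	if (delta < 0)		/* a backwards clock reading is ignored */
		delta = 0;

	*exec_start = now_task_clock;
	*runtime   -= delta;

	return *runtime <= 0;	/* caller dequeues and arms the dl_timer */
}
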
639
640#ifdef CONFIG_SMP
641
642static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
643
644static inline u64 next_deadline(struct rq *rq)
645{
646 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
647
648 if (next && dl_prio(next->prio))
649 return next->dl.deadline;
650 else
651 return 0;
652}
653
654static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
655{
656 struct rq *rq = rq_of_dl_rq(dl_rq);
657
658 if (dl_rq->earliest_dl.curr == 0 ||
659 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
660 /*
661 * If the dl_rq had no -deadline tasks, or if the new task
662 * has a shorter deadline than the current one on dl_rq, we
663 * know that the previous earliest becomes our next earliest,
664 * as the new task becomes the earliest itself.
665 */
666 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
667 dl_rq->earliest_dl.curr = deadline;
668 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
669 } else if (dl_rq->earliest_dl.next == 0 ||
670 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
671 /*
672 * On the other hand, if the new -deadline task has a
673 * later deadline than the earliest one on dl_rq, but
674 * it is earlier than the next (if any), we must
675 * recompute the next-earliest.
676 */
677 dl_rq->earliest_dl.next = next_deadline(rq);
678 }
679}
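
The earliest/next caching can be sketched on its own. The structure below is a hypothetical mirror of dl_rq->earliest_dl; note that on the second branch the kernel recomputes the next-earliest from the rbtree, whereas the sketch simply takes the new deadline, which is only equivalent when the cached value is up to date.

#include <stdint.h>

struct earliest_cache {		/* hypothetical mirror of dl_rq->earliest_dl */
	uint64_t curr;		/* earliest deadline on the rq (0 = none)    */
	uint64_t next;		/* second-earliest deadline (0 = none)       */
};

static int earlier(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

/* Update the cache when a task with 'deadline' is enqueued on the rq. */
static void cache_on_enqueue(struct earliest_cache *c, uint64_t deadline)
{
	if (c->curr == 0 || earlier(deadline, c->curr)) {
		c->next = c->curr;	/* the old earliest becomes next-earliest */
		c->curr = deadline;
	} else if (c->next == 0 || earlier(deadline, c->next)) {
		c->next = deadline;	/* kernel: recomputed via next_deadline() */
	}
}
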
680
681static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
682{
683 struct rq *rq = rq_of_dl_rq(dl_rq);
684
685 /*
686 * Since we may have removed our earliest (and/or next earliest)
687 * task we must recompute them.
688 */
689 if (!dl_rq->dl_nr_running) {
690 dl_rq->earliest_dl.curr = 0;
691 dl_rq->earliest_dl.next = 0;
692 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
693 } else {
694 struct rb_node *leftmost = dl_rq->rb_leftmost;
695 struct sched_dl_entity *entry;
696
697 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
698 dl_rq->earliest_dl.curr = entry->deadline;
699 dl_rq->earliest_dl.next = next_deadline(rq);
700 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
701 }
702}
703
704#else
705
706static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
707static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
708
709#endif /* CONFIG_SMP */
710
711static inline
712void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
713{
714 int prio = dl_task_of(dl_se)->prio;
715 u64 deadline = dl_se->deadline;
716
717 WARN_ON(!dl_prio(prio));
718 dl_rq->dl_nr_running++;
719
720 inc_dl_deadline(dl_rq, deadline);
721 inc_dl_migration(dl_se, dl_rq);
722}
723
724static inline
725void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
726{
727 int prio = dl_task_of(dl_se)->prio;
728
729 WARN_ON(!dl_prio(prio));
730 WARN_ON(!dl_rq->dl_nr_running);
731 dl_rq->dl_nr_running--;
732
733 dec_dl_deadline(dl_rq, dl_se->deadline);
734 dec_dl_migration(dl_se, dl_rq);
735}
736
737static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
738{
739 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
740 struct rb_node **link = &dl_rq->rb_root.rb_node;
741 struct rb_node *parent = NULL;
742 struct sched_dl_entity *entry;
743 int leftmost = 1;
744
745 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
746
747 while (*link) {
748 parent = *link;
749 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
750 if (dl_time_before(dl_se->deadline, entry->deadline))
751 link = &parent->rb_left;
752 else {
753 link = &parent->rb_right;
754 leftmost = 0;
755 }
756 }
757
758 if (leftmost)
759 dl_rq->rb_leftmost = &dl_se->rb_node;
760
761 rb_link_node(&dl_se->rb_node, parent, link);
762 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
763
764 inc_dl_tasks(dl_se, dl_rq);
765}
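
As a user-space model of the EDF ordering kept by the rbtree, here is a sorted singly-linked list whose head plays the role of rb_leftmost; dl_node and dl_insert are made up for the illustration, and the signed comparison mirrors dl_time_before().

#include <stdint.h>
#include <stddef.h>

struct dl_node {
	uint64_t deadline;
	struct dl_node *next;
};

static int dl_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

/*
 * Keep the list sorted by absolute deadline; earlier deadlines sort
 * first and ties go behind existing entries, as in the tree walk above.
 * The returned head is the "leftmost" element: the next entity to run.
 */
static struct dl_node *dl_insert(struct dl_node *head, struct dl_node *n)
{
	struct dl_node **link = &head;

	while (*link != NULL && !dl_before(n->deadline, (*link)->deadline))
		link = &(*link)->next;

	n->next = *link;
	*link = n;

	return head;
}
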
766
767static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
768{
769 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
770
771 if (RB_EMPTY_NODE(&dl_se->rb_node))
772 return;
773
774 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
775 struct rb_node *next_node;
776
777 next_node = rb_next(&dl_se->rb_node);
778 dl_rq->rb_leftmost = next_node;
779 }
780
781 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
782 RB_CLEAR_NODE(&dl_se->rb_node);
783
784 dec_dl_tasks(dl_se, dl_rq);
785}
786
787static void
788enqueue_dl_entity(struct sched_dl_entity *dl_se,
789 struct sched_dl_entity *pi_se, int flags)
790{
791 BUG_ON(on_dl_rq(dl_se));
792
793 /*
794 * If this is a wakeup or a new instance, the scheduling
795 * parameters of the task might need updating. Otherwise,
796 * we want a replenishment of its runtime.
797 */
798 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
799 replenish_dl_entity(dl_se, pi_se);
800 else
801 update_dl_entity(dl_se, pi_se);
802
803 __enqueue_dl_entity(dl_se);
804}
805
806static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
807{
808 __dequeue_dl_entity(dl_se);
809}
810
811static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
812{
813 struct task_struct *pi_task = rt_mutex_get_top_task(p);
814 struct sched_dl_entity *pi_se = &p->dl;
815
816 /*
817 * Use the scheduling parameters of the top pi-waiter
818 * task if we have one and its (relative) deadline is
819 * smaller than ours... otherwise we keep our runtime and
820 * deadline.
821 */
822 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
823 pi_se = &pi_task->dl;
824
825 /*
826 * If p is throttled, we do nothing. In fact, if it exhausted
827 * its budget it needs a replenishment and, since it now is on
828 * its rq, the bandwidth timer callback (which clearly has not
829 * run yet) will take care of this.
830 */
831 if (p->dl.dl_throttled)
832 return;
833
834 enqueue_dl_entity(&p->dl, pi_se, flags);
835
836 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
837 enqueue_pushable_dl_task(rq, p);
838
839 inc_nr_running(rq);
840}
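
The parameter selection under priority inheritance reduces to one decision. The sketch below is hypothetical (dl_params is the stand-in structure used in the earlier sketches) and only captures the condition applied above:

#include <stdbool.h>
#include <stddef.h>

struct dl_params;	/* opaque here; see the replenishment sketch above */

/*
 * Run on the donor's (top pi-waiter's) parameters only if we are
 * boosted and that waiter is itself a -deadline task; otherwise keep
 * our own runtime and (relative) deadline.
 */
static const struct dl_params *
pick_enqueue_params(const struct dl_params *own,
		    const struct dl_params *top_waiter,
		    bool boosted, bool waiter_is_dl)
{
	if (boosted && top_waiter != NULL && waiter_is_dl)
		return top_waiter;

	return own;
}
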
841
842static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
843{
844 dequeue_dl_entity(&p->dl);
845 dequeue_pushable_dl_task(rq, p);
846}
847
848static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
849{
850 update_curr_dl(rq);
851 __dequeue_task_dl(rq, p, flags);
852
853 dec_nr_running(rq);
854}
855
856/*
857 * Yield task semantic for -deadline tasks is:
858 *
859 * get off from the CPU until our next instance, with
860 * a new runtime. This is of little use now, since we
861 * don't have a bandwidth reclaiming mechanism. Anyway,
862 * bandwidth reclaiming is planned for the future, and
863 * yield_task_dl will indicate that some spare budget
864 * is available for other task instances to use.
865 */
866static void yield_task_dl(struct rq *rq)
867{
868 struct task_struct *p = rq->curr;
869
870 /*
871 * We make the task go to sleep until its current deadline by
872 * forcing its runtime to zero. This way, update_curr_dl() stops
873 * it and the bandwidth timer will wake it up and will give it
874 * new scheduling parameters (thanks to dl_new=1).
875 */
876 if (p->dl.runtime > 0) {
877 rq->curr->dl.dl_new = 1;
878 p->dl.runtime = 0;
879 }
880 update_curr_dl(rq);
881}
882
883#ifdef CONFIG_SMP
884
885static int find_later_rq(struct task_struct *task);
886
887static int
888select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
889{
890 struct task_struct *curr;
891 struct rq *rq;
892
893 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
894 goto out;
895
896 rq = cpu_rq(cpu);
897
898 rcu_read_lock();
899 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
900
901 /*
902 * If we are dealing with a -deadline task, we must
903 * decide where to wake it up.
904 * If it has a later deadline and the current task
905 * on this rq can't move (provided the waking task
906 * can!) we prefer to send it somewhere else. On the
907 * other hand, if it has a shorter deadline, we
908 * try to make it stay here, it might be important.
909 */
910 if (unlikely(dl_task(curr)) &&
911 (curr->nr_cpus_allowed < 2 ||
912 !dl_entity_preempt(&p->dl, &curr->dl)) &&
913 (p->nr_cpus_allowed > 1)) {
914 int target = find_later_rq(p);
915
916 if (target != -1)
917 cpu = target;
918 }
919 rcu_read_unlock();
920
921out:
922 return cpu;
923}
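
The wakeup placement condition above, extracted as a predicate over pre-computed values; every parameter is a hypothetical flattening of the checks made on rq->curr and on the waking task p:

#include <stdbool.h>

/*
 * Look for a later runqueue only when the currently running task is
 * also -deadline, it cannot migrate (or would not be preempted by the
 * waking task), and the waking task itself is free to move.
 */
static bool should_find_later_rq(bool curr_is_dl, int curr_cpus_allowed,
				 bool waking_preempts_curr,
				 int waking_cpus_allowed)
{
	return curr_is_dl &&
	       (curr_cpus_allowed < 2 || !waking_preempts_curr) &&
	       waking_cpus_allowed > 1;
}
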
924
925static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
926{
927 /*
928 * Current can't be migrated, useless to reschedule,
929 * let's hope p can move out.
930 */
931 if (rq->curr->nr_cpus_allowed == 1 ||
932 cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
933 return;
934
935 /*
936 * p is migratable, so let's not schedule it and
937 * see if it is pushed or pulled somewhere else.
938 */
939 if (p->nr_cpus_allowed != 1 &&
940 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
941 return;
942
943 resched_task(rq->curr);
944}
945
946#endif /* CONFIG_SMP */
947
948/*
949 * Only called when both the current and waking task are -deadline
950 * tasks.
951 */
952static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
953 int flags)
954{
955 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
956 resched_task(rq->curr);
957 return;
958 }
959
960#ifdef CONFIG_SMP
961 /*
962 * In the unlikely case current and p have the same deadline
963 * let us try to decide what's the best thing to do...
964 */
965 if ((p->dl.deadline == rq->curr->dl.deadline) &&
966 !test_tsk_need_resched(rq->curr))
967 check_preempt_equal_dl(rq, p);
968#endif /* CONFIG_SMP */
969}
970
971#ifdef CONFIG_SCHED_HRTICK
972static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
973{
974 s64 delta = p->dl.dl_runtime - p->dl.runtime;
975
976 if (delta > 10000)
977 hrtick_start(rq, p->dl.runtime);
978}
979#endif
980
981static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
982 struct dl_rq *dl_rq)
983{
984 struct rb_node *left = dl_rq->rb_leftmost;
985
986 if (!left)
987 return NULL;
988
989 return rb_entry(left, struct sched_dl_entity, rb_node);
990}
991
992struct task_struct *pick_next_task_dl(struct rq *rq)
993{
994 struct sched_dl_entity *dl_se;
995 struct task_struct *p;
996 struct dl_rq *dl_rq;
997
998 dl_rq = &rq->dl;
999
1000 if (unlikely(!dl_rq->dl_nr_running))
1001 return NULL;
1002
1003 dl_se = pick_next_dl_entity(rq, dl_rq);
1004 BUG_ON(!dl_se);
1005
1006 p = dl_task_of(dl_se);
1007 p->se.exec_start = rq_clock_task(rq);
1008
1009 /* Running task will never be pushed. */
1010 dequeue_pushable_dl_task(rq, p);
1011
1012#ifdef CONFIG_SCHED_HRTICK
1013 if (hrtick_enabled(rq))
1014 start_hrtick_dl(rq, p);
1015#endif
1016
1017#ifdef CONFIG_SMP
1018 rq->post_schedule = has_pushable_dl_tasks(rq);
1019#endif /* CONFIG_SMP */
1020
1021 return p;
1022}
1023
1024static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1025{
1026 update_curr_dl(rq);
1027
1028 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1029 enqueue_pushable_dl_task(rq, p);
1030}
1031
1032static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1033{
1034 update_curr_dl(rq);
1035
1036#ifdef CONFIG_SCHED_HRTICK
1037 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1038 start_hrtick_dl(rq, p);
1039#endif
1040}
1041
1042static void task_fork_dl(struct task_struct *p)
1043{
1044 /*
1045 * SCHED_DEADLINE tasks cannot fork and this is achieved through
1046 * sched_fork()
1047 */
1048}
1049
1050static void task_dead_dl(struct task_struct *p)
1051{
1052 struct hrtimer *timer = &p->dl.dl_timer;
1053 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1054
1055 /*
1056 * Since we are TASK_DEAD we won't slip out of the domain!
1057 */
1058 raw_spin_lock_irq(&dl_b->lock);
1059 dl_b->total_bw -= p->dl.dl_bw;
1060 raw_spin_unlock_irq(&dl_b->lock);
1061
1062 hrtimer_cancel(timer);
1063}
1064
1065static void set_curr_task_dl(struct rq *rq)
1066{
1067 struct task_struct *p = rq->curr;
1068
1069 p->se.exec_start = rq_clock_task(rq);
1070
1071 /* You can't push away the running task */
1072 dequeue_pushable_dl_task(rq, p);
1073}
1074
1075#ifdef CONFIG_SMP
1076
1077/* Only try algorithms three times */
1078#define DL_MAX_TRIES 3
1079
1080static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1081{
1082 if (!task_running(rq, p) &&
1083 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1084 (p->nr_cpus_allowed > 1))
1085 return 1;
1086
1087 return 0;
1088}
1089
1090/* Returns the second earliest -deadline task, NULL otherwise */
1091static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1092{
1093 struct rb_node *next_node = rq->dl.rb_leftmost;
1094 struct sched_dl_entity *dl_se;
1095 struct task_struct *p = NULL;
1096
1097next_node:
1098 next_node = rb_next(next_node);
1099 if (next_node) {
1100 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1101 p = dl_task_of(dl_se);
1102
1103 if (pick_dl_task(rq, p, cpu))
1104 return p;
1105
1106 goto next_node;
1107 }
1108
1109 return NULL;
1110}
1111
1112static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1113
1114static int find_later_rq(struct task_struct *task)
1115{
1116 struct sched_domain *sd;
1117 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
1118 int this_cpu = smp_processor_id();
1119 int best_cpu, cpu = task_cpu(task);
1120
1121 /* Make sure the mask is initialized first */
1122 if (unlikely(!later_mask))
1123 return -1;
1124
1125 if (task->nr_cpus_allowed == 1)
1126 return -1;
1127
1128 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1129 task, later_mask);
1130 if (best_cpu == -1)
1131 return -1;
1132
1133 /*
1134 * If we are here, some target has been found,
1135 * the most suitable of which is cached in best_cpu.
1136 * This is, among the runqueues whose current tasks
1137 * have later deadlines than this task's, the rq
1138 * with the latest such deadline.
1139 *
1140 * Now we check how well this matches with task's
1141 * affinity and system topology.
1142 *
1143 * The last cpu where the task ran is our first
1144 * guess, since it is most likely cache-hot there.
1145 */
1146 if (cpumask_test_cpu(cpu, later_mask))
1147 return cpu;
1148 /*
1149 * Check if this_cpu is to be skipped (i.e., it is
1150 * not in the mask) or not.
1151 */
1152 if (!cpumask_test_cpu(this_cpu, later_mask))
1153 this_cpu = -1;
1154
1155 rcu_read_lock();
1156 for_each_domain(cpu, sd) {
1157 if (sd->flags & SD_WAKE_AFFINE) {
1158
1159 /*
1160 * If possible, preempting this_cpu is
1161 * cheaper than migrating.
1162 */
1163 if (this_cpu != -1 &&
1164 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1165 rcu_read_unlock();
1166 return this_cpu;
1167 }
1168
1169 /*
1170 * Last chance: if best_cpu is valid and is
1171 * in the mask, that becomes our choice.
1172 */
1173 if (best_cpu < nr_cpu_ids &&
1174 cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
1175 rcu_read_unlock();
1176 return best_cpu;
1177 }
1178 }
1179 }
1180 rcu_read_unlock();
1181
1182 /*
1183 * At this point, all our guesses failed, we just return
1184 * 'something', and let the caller sort things out.
1185 */
1186 if (this_cpu != -1)
1187 return this_cpu;
1188
1189 cpu = cpumask_any(later_mask);
1190 if (cpu < nr_cpu_ids)
1191 return cpu;
1192
1193 return -1;
1194}
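
The selection heuristics boil down to a fixed preference order. A sketch with hypothetical, pre-computed arguments follows (in the kernel they are derived from later_mask, the sched domains and cpudl_find()):

#include <stdbool.h>

/*
 * Preference order when choosing where to push: (1) the task's previous
 * CPU if it is in the later mask (likely cache-hot), (2) this CPU if it
 * shares a wake-affine domain with the previous one, (3) the best CPU
 * reported by the cpudl heap, (4) any CPU left in the mask, or -1.
 */
static int pick_later_cpu(int prev_cpu, bool prev_in_mask,
			  int this_cpu, bool this_affine_and_in_mask,
			  int best_cpu, bool best_in_domain,
			  int any_cpu_in_mask)
{
	if (prev_in_mask)
		return prev_cpu;
	if (this_affine_and_in_mask)
		return this_cpu;
	if (best_cpu >= 0 && best_in_domain)
		return best_cpu;

	return any_cpu_in_mask;		/* may be -1 if the mask is empty */
}
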
1195
1196/* Locks the rq it finds */
1197static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1198{
1199 struct rq *later_rq = NULL;
1200 int tries;
1201 int cpu;
1202
1203 for (tries = 0; tries < DL_MAX_TRIES; tries++) {
1204 cpu = find_later_rq(task);
1205
1206 if ((cpu == -1) || (cpu == rq->cpu))
1207 break;
1208
1209 later_rq = cpu_rq(cpu);
1210
1211 /* Retry if something changed. */
1212 if (double_lock_balance(rq, later_rq)) {
1213 if (unlikely(task_rq(task) != rq ||
1214 !cpumask_test_cpu(later_rq->cpu,
1215 &task->cpus_allowed) ||
1216 task_running(rq, task) || !task->on_rq)) {
1217 double_unlock_balance(rq, later_rq);
1218 later_rq = NULL;
1219 break;
1220 }
1221 }
1222
1223 /*
1224 * If the rq we found has no -deadline task, or
1225 * its earliest one has a later deadline than our
1226 * task, the rq is a good one.
1227 */
1228 if (!later_rq->dl.dl_nr_running ||
1229 dl_time_before(task->dl.deadline,
1230 later_rq->dl.earliest_dl.curr))
1231 break;
1232
1233 /* Otherwise we try again. */
1234 double_unlock_balance(rq, later_rq);
1235 later_rq = NULL;
1236 }
1237
1238 return later_rq;
1239}
1240
1241static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1242{
1243 struct task_struct *p;
1244
1245 if (!has_pushable_dl_tasks(rq))
1246 return NULL;
1247
1248 p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
1249 struct task_struct, pushable_dl_tasks);
1250
1251 BUG_ON(rq->cpu != task_cpu(p));
1252 BUG_ON(task_current(rq, p));
1253 BUG_ON(p->nr_cpus_allowed <= 1);
1254
1255 BUG_ON(!p->on_rq);
1256 BUG_ON(!dl_task(p));
1257
1258 return p;
1259}
1260
1261/*
1262 * See if the non running -deadline tasks on this rq
1263 * can be sent to some other CPU where they can preempt
1264 * and start executing.
1265 */
1266static int push_dl_task(struct rq *rq)
1267{
1268 struct task_struct *next_task;
1269 struct rq *later_rq;
1270
1271 if (!rq->dl.overloaded)
1272 return 0;
1273
1274 next_task = pick_next_pushable_dl_task(rq);
1275 if (!next_task)
1276 return 0;
1277
1278retry:
1279 if (unlikely(next_task == rq->curr)) {
1280 WARN_ON(1);
1281 return 0;
1282 }
1283
1284 /*
1285 * If next_task preempts rq->curr, and rq->curr
1286 * can move away, it makes sense to just reschedule
1287 * without going further in pushing next_task.
1288 */
1289 if (dl_task(rq->curr) &&
1290 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1291 rq->curr->nr_cpus_allowed > 1) {
1292 resched_task(rq->curr);
1293 return 0;
1294 }
1295
1296 /* We might release rq lock */
1297 get_task_struct(next_task);
1298
1299 /* Will lock the rq it'll find */
1300 later_rq = find_lock_later_rq(next_task, rq);
1301 if (!later_rq) {
1302 struct task_struct *task;
1303
1304 /*
1305 * We must check all this again, since
1306 * find_lock_later_rq releases rq->lock and it is
1307 * then possible that next_task has migrated.
1308 */
1309 task = pick_next_pushable_dl_task(rq);
1310 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1311 /*
1312 * The task is still there. We don't try
1313 * again, some other cpu will pull it when ready.
1314 */
1315 dequeue_pushable_dl_task(rq, next_task);
1316 goto out;
1317 }
1318
1319 if (!task)
1320 /* No more tasks */
1321 goto out;
1322
1323 put_task_struct(next_task);
1324 next_task = task;
1325 goto retry;
1326 }
1327
1328 deactivate_task(rq, next_task, 0);
1329 set_task_cpu(next_task, later_rq->cpu);
1330 activate_task(later_rq, next_task, 0);
1331
1332 resched_task(later_rq->curr);
1333
1334 double_unlock_balance(rq, later_rq);
1335
1336out:
1337 put_task_struct(next_task);
1338
1339 return 1;
1340}
1341
1342static void push_dl_tasks(struct rq *rq)
1343{
1344 /* Terminates as it moves a -deadline task */
1345 while (push_dl_task(rq))
1346 ;
1347}
1348
1349static int pull_dl_task(struct rq *this_rq)
1350{
1351 int this_cpu = this_rq->cpu, ret = 0, cpu;
1352 struct task_struct *p;
1353 struct rq *src_rq;
1354 u64 dmin = LONG_MAX;
1355
1356 if (likely(!dl_overloaded(this_rq)))
1357 return 0;
1358
1359 /*
1360 * Match the barrier from dl_set_overloaded; this guarantees that if we
1361 * see overloaded we must also see the dlo_mask bit.
1362 */
1363 smp_rmb();
1364
1365 for_each_cpu(cpu, this_rq->rd->dlo_mask) {
1366 if (this_cpu == cpu)
1367 continue;
1368
1369 src_rq = cpu_rq(cpu);
1370
1371 /*
1372 * It looks racy, and it is! However, as in sched_rt.c,
1373 * we are fine with this.
1374 */
1375 if (this_rq->dl.dl_nr_running &&
1376 dl_time_before(this_rq->dl.earliest_dl.curr,
1377 src_rq->dl.earliest_dl.next))
1378 continue;
1379
1380 /* Might drop this_rq->lock */
1381 double_lock_balance(this_rq, src_rq);
1382
1383 /*
1384 * If there are no more pullable tasks on the
1385 * rq, we're done with it.
1386 */
1387 if (src_rq->dl.dl_nr_running <= 1)
1388 goto skip;
1389
1390 p = pick_next_earliest_dl_task(src_rq, this_cpu);
1391
1392 /*
1393 * We found a task to be pulled if:
1394 * - it preempts our current (if there's one),
1395 * - it will preempt the last one we pulled (if any).
1396 */
1397 if (p && dl_time_before(p->dl.deadline, dmin) &&
1398 (!this_rq->dl.dl_nr_running ||
1399 dl_time_before(p->dl.deadline,
1400 this_rq->dl.earliest_dl.curr))) {
1401 WARN_ON(p == src_rq->curr);
1402 WARN_ON(!p->on_rq);
1403
1404 /*
1405 * Then we pull iff p has actually an earlier
1406 * deadline than the current task of its runqueue.
1407 */
1408 if (dl_time_before(p->dl.deadline,
1409 src_rq->curr->dl.deadline))
1410 goto skip;
1411
1412 ret = 1;
1413
1414 deactivate_task(src_rq, p, 0);
1415 set_task_cpu(p, this_cpu);
1416 activate_task(this_rq, p, 0);
1417 dmin = p->dl.deadline;
1418
1419 /* Is there any other task even earlier? */
1420 }
1421skip:
1422 double_unlock_balance(this_rq, src_rq);
1423 }
1424
1425 return ret;
1426}
1427
1428static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1429{
1430 /* Try to pull other tasks here */
1431 if (dl_task(prev))
1432 pull_dl_task(rq);
1433}
1434
1435static void post_schedule_dl(struct rq *rq)
1436{
1437 push_dl_tasks(rq);
1438}
1439
1440/*
1441 * Since the task is not running and a reschedule is not going to happen
1442 * anytime soon on its runqueue, we try pushing it away now.
1443 */
1444static void task_woken_dl(struct rq *rq, struct task_struct *p)
1445{
1446 if (!task_running(rq, p) &&
1447 !test_tsk_need_resched(rq->curr) &&
1448 has_pushable_dl_tasks(rq) &&
1449 p->nr_cpus_allowed > 1 &&
1450 dl_task(rq->curr) &&
1451 (rq->curr->nr_cpus_allowed < 2 ||
1452 dl_entity_preempt(&rq->curr->dl, &p->dl))) {
1453 push_dl_tasks(rq);
1454 }
1455}
1456
1457static void set_cpus_allowed_dl(struct task_struct *p,
1458 const struct cpumask *new_mask)
1459{
1460 struct rq *rq;
1461 int weight;
1462
1463 BUG_ON(!dl_task(p));
1464
1465 /*
1466 * Update only if the task is actually running (i.e.,
1467 * it is on the rq AND it is not throttled).
1468 */
1469 if (!on_dl_rq(&p->dl))
1470 return;
1471
1472 weight = cpumask_weight(new_mask);
1473
1474 /*
1475 * Only update if the process changes whether or not it
1476 * can migrate.
1477 */
1478 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1479 return;
1480
1481 rq = task_rq(p);
1482
1483 /*
1484 * The process used to be able to migrate OR it can now migrate
1485 */
1486 if (weight <= 1) {
1487 if (!task_current(rq, p))
1488 dequeue_pushable_dl_task(rq, p);
1489 BUG_ON(!rq->dl.dl_nr_migratory);
1490 rq->dl.dl_nr_migratory--;
1491 } else {
1492 if (!task_current(rq, p))
1493 enqueue_pushable_dl_task(rq, p);
1494 rq->dl.dl_nr_migratory++;
1495 }
1496
1497 update_dl_migration(&rq->dl);
1498}
1499
1500/* Assumes rq->lock is held */
1501static void rq_online_dl(struct rq *rq)
1502{
1503 if (rq->dl.overloaded)
1504 dl_set_overload(rq);
1505
1506 if (rq->dl.dl_nr_running > 0)
1507 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1508}
1509
1510/* Assumes rq->lock is held */
1511static void rq_offline_dl(struct rq *rq)
1512{
1513 if (rq->dl.overloaded)
1514 dl_clear_overload(rq);
1515
1516 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1517}
1518
1519void init_sched_dl_class(void)
1520{
1521 unsigned int i;
1522
1523 for_each_possible_cpu(i)
1524 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
1525 GFP_KERNEL, cpu_to_node(i));
1526}
1527
1528#endif /* CONFIG_SMP */
1529
1530static void switched_from_dl(struct rq *rq, struct task_struct *p)
1531{
1532 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1533 hrtimer_try_to_cancel(&p->dl.dl_timer);
1534
1535#ifdef CONFIG_SMP
1536 /*
1537 * Since this might be the only -deadline task on the rq,
1538 * this is the right place to try to pull some other one
1539 * from an overloaded cpu, if any.
1540 */
1541 if (!rq->dl.dl_nr_running)
1542 pull_dl_task(rq);
1543#endif
1544}
1545
1546/*
1547 * When switching to -deadline, we may overload the rq, then
1548 * we try to push someone off, if possible.
1549 */
1550static void switched_to_dl(struct rq *rq, struct task_struct *p)
1551{
1552 int check_resched = 1;
1553
1554 /*
1555 * If p is throttled, don't consider the possibility
1556 * of preempting rq->curr, the check will be done right
1557 * after its runtime gets replenished.
1558 */
1559 if (unlikely(p->dl.dl_throttled))
1560 return;
1561
1562 if (p->on_rq || rq->curr != p) {
1563#ifdef CONFIG_SMP
1564 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1565 /* Only reschedule if pushing failed */
1566 check_resched = 0;
1567#endif /* CONFIG_SMP */
1568 if (check_resched && task_has_dl_policy(rq->curr))
1569 check_preempt_curr_dl(rq, p, 0);
1570 }
1571}
1572
1573/*
1574 * If the scheduling parameters of a -deadline task changed,
1575 * a push or pull operation might be needed.
1576 */
1577static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1578 int oldprio)
1579{
1580 if (p->on_rq || rq->curr == p) {
1581#ifdef CONFIG_SMP
1582 /*
1583 * This might be too much, but unfortunately
1584 * we don't have the old deadline value, and
1585 * we can't tell if the task is increasing
1586 * or lowering its prio, so...
1587 */
1588 if (!rq->dl.overloaded)
1589 pull_dl_task(rq);
1590
1591 /*
1592 * If we now have an earlier deadline task than p,
1593 * then reschedule, provided p is still on this
1594 * runqueue.
1595 */
1596 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1597 rq->curr == p)
1598 resched_task(p);
1599#else
1600 /*
1601 * Again, we don't know if p has an earlier
1602 * or later deadline, so let's blindly set a
1603 * (maybe not needed) rescheduling point.
1604 */
1605 resched_task(p);
1606#endif /* CONFIG_SMP */
1607 } else
1608 switched_to_dl(rq, p);
1609}
1610
1611const struct sched_class dl_sched_class = {
1612 .next = &rt_sched_class,
1613 .enqueue_task = enqueue_task_dl,
1614 .dequeue_task = dequeue_task_dl,
1615 .yield_task = yield_task_dl,
1616
1617 .check_preempt_curr = check_preempt_curr_dl,
1618
1619 .pick_next_task = pick_next_task_dl,
1620 .put_prev_task = put_prev_task_dl,
1621
1622#ifdef CONFIG_SMP
1623 .select_task_rq = select_task_rq_dl,
1624 .set_cpus_allowed = set_cpus_allowed_dl,
1625 .rq_online = rq_online_dl,
1626 .rq_offline = rq_offline_dl,
1627 .pre_schedule = pre_schedule_dl,
1628 .post_schedule = post_schedule_dl,
1629 .task_woken = task_woken_dl,
1630#endif
1631
1632 .set_curr_task = set_curr_task_dl,
1633 .task_tick = task_tick_dl,
1634 .task_fork = task_fork_dl,
1635 .task_dead = task_dead_dl,
1636
1637 .prio_changed = prio_changed_dl,
1638 .switched_from = switched_from_dl,
1639 .switched_to = switched_to_dl,
1640};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d1817e8f..dd52e7ffb10e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
140#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING 141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 SEQ_printf(m, " %d", task_node(p));
143#endif 143#endif
144#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
145 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
371 PN(cpu_clk); 371 PN(cpu_clk);
372 P(jiffies); 372 P(jiffies);
373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
374 P(sched_clock_stable); 374 P(sched_clock_stable());
375#endif 375#endif
376#undef PN 376#undef PN
377#undef P 377#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e64b0794060e..b24b6cfde9aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
872 return max(smin, smax); 872 return max(smin, smax);
873} 873}
874 874
875/*
876 * Once a preferred node is selected the scheduler balancer will prefer moving
877 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
878 * scans. This will give the process the chance to accumulate more faults on
879 * the preferred node but still allow the scheduler to move the task again if
880 * the nodes CPUs are overloaded.
881 */
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 875static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{ 876{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1); 877 rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
930 if (!p->numa_group) 921 if (!p->numa_group)
931 return 0; 922 return 0;
932 923
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 924 return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 p->numa_group->faults[task_faults_idx(nid, 1)];
934} 926}
935 927
936/* 928/*
@@ -1023,7 +1015,7 @@ struct task_numa_env {
1023 1015
1024 struct numa_stats src_stats, dst_stats; 1016 struct numa_stats src_stats, dst_stats;
1025 1017
1026 int imbalance_pct, idx; 1018 int imbalance_pct;
1027 1019
1028 struct task_struct *best_task; 1020 struct task_struct *best_task;
1029 long best_imp; 1021 long best_imp;
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
1211 * elsewhere, so there is no point in (re)trying. 1203 * elsewhere, so there is no point in (re)trying.
1212 */ 1204 */
1213 if (unlikely(!sd)) { 1205 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1206 p->numa_preferred_nid = task_node(p);
1215 return -EINVAL; 1207 return -EINVAL;
1216 } 1208 }
1217 1209
@@ -1278,7 +1270,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1278 p->numa_migrate_retry = jiffies + HZ; 1270 p->numa_migrate_retry = jiffies + HZ;
1279 1271
1280 /* Success if task is already running on preferred CPU */ 1272 /* Success if task is already running on preferred CPU */
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1273 if (task_node(p) == p->numa_preferred_nid)
1282 return; 1274 return;
1283 1275
1284 /* Otherwise, try migrate to a CPU on the preferred node */ 1276 /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1350,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p,
1350 * scanning faster if shared accesses dominate as it may 1342 * scanning faster if shared accesses dominate as it may
1351 * simply bounce migrations uselessly 1343 * simply bounce migrations uselessly
1352 */ 1344 */
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1345 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1346 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 } 1347 }
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4101 */ 4092 */
4102static struct sched_group * 4093static struct sched_group *
4103find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4094find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4104 int this_cpu, int load_idx) 4095 int this_cpu, int sd_flag)
4105{ 4096{
4106 struct sched_group *idlest = NULL, *group = sd->groups; 4097 struct sched_group *idlest = NULL, *group = sd->groups;
4107 unsigned long min_load = ULONG_MAX, this_load = 0; 4098 unsigned long min_load = ULONG_MAX, this_load = 0;
4099 int load_idx = sd->forkexec_idx;
4108 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4100 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4109 4101
4102 if (sd_flag & SD_BALANCE_WAKE)
4103 load_idx = sd->wake_idx;
4104
4110 do { 4105 do {
4111 unsigned long load, avg_load; 4106 unsigned long load, avg_load;
4112 int local_group; 4107 int local_group;
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4274 } 4269 }
4275 4270
4276 while (sd) { 4271 while (sd) {
4277 int load_idx = sd->forkexec_idx;
4278 struct sched_group *group; 4272 struct sched_group *group;
4279 int weight; 4273 int weight;
4280 4274
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4283 continue; 4277 continue;
4284 } 4278 }
4285 4279
4286 if (sd_flag & SD_BALANCE_WAKE) 4280 group = find_idlest_group(sd, p, cpu, sd_flag);
4287 load_idx = sd->wake_idx;
4288
4289 group = find_idlest_group(sd, p, cpu, load_idx);
4290 if (!group) { 4281 if (!group) {
4291 sd = sd->child; 4282 sd = sd->child;
4292 continue; 4283 continue;
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5512 struct sched_group *group, int load_idx, 5503 struct sched_group *group, int load_idx,
5513 int local_group, struct sg_lb_stats *sgs) 5504 int local_group, struct sg_lb_stats *sgs)
5514{ 5505{
5515 unsigned long nr_running;
5516 unsigned long load; 5506 unsigned long load;
5517 int i; 5507 int i;
5518 5508
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5511 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5522 struct rq *rq = cpu_rq(i); 5512 struct rq *rq = cpu_rq(i);
5523 5513
5524 nr_running = rq->nr_running;
5525
5526 /* Bias balancing toward cpus of our domain */ 5514 /* Bias balancing toward cpus of our domain */
5527 if (local_group) 5515 if (local_group)
5528 load = target_load(i, load_idx); 5516 load = target_load(i, load_idx);
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5530 load = source_load(i, load_idx); 5518 load = source_load(i, load_idx);
5531 5519
5532 sgs->group_load += load; 5520 sgs->group_load += load;
5533 sgs->sum_nr_running += nr_running; 5521 sgs->sum_nr_running += rq->nr_running;
5534#ifdef CONFIG_NUMA_BALANCING 5522#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running; 5523 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running; 5524 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6521,7 +6509,7 @@ static struct {
6521 unsigned long next_balance; /* in jiffy units */ 6509 unsigned long next_balance; /* in jiffy units */
6522} nohz ____cacheline_aligned; 6510} nohz ____cacheline_aligned;
6523 6511
6524static inline int find_new_ilb(int call_cpu) 6512static inline int find_new_ilb(void)
6525{ 6513{
6526 int ilb = cpumask_first(nohz.idle_cpus_mask); 6514 int ilb = cpumask_first(nohz.idle_cpus_mask);
6527 6515
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
6536 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6524 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6537 * CPU (if there is one). 6525 * CPU (if there is one).
6538 */ 6526 */
6539static void nohz_balancer_kick(int cpu) 6527static void nohz_balancer_kick(void)
6540{ 6528{
6541 int ilb_cpu; 6529 int ilb_cpu;
6542 6530
6543 nohz.next_balance++; 6531 nohz.next_balance++;
6544 6532
6545 ilb_cpu = find_new_ilb(cpu); 6533 ilb_cpu = find_new_ilb();
6546 6534
6547 if (ilb_cpu >= nr_cpu_ids) 6535 if (ilb_cpu >= nr_cpu_ids)
6548 return; 6536 return;
@@ -6652,10 +6640,10 @@ void update_max_interval(void)
6652 * 6640 *
6653 * Balancing parameters are set up in init_sched_domains. 6641 * Balancing parameters are set up in init_sched_domains.
6654 */ 6642 */
6655static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6643static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6656{ 6644{
6657 int continue_balancing = 1; 6645 int continue_balancing = 1;
6658 struct rq *rq = cpu_rq(cpu); 6646 int cpu = rq->cpu;
6659 unsigned long interval; 6647 unsigned long interval;
6660 struct sched_domain *sd; 6648 struct sched_domain *sd;
6661 /* Earliest time when we have to do rebalance again */ 6649 /* Earliest time when we have to do rebalance again */
@@ -6752,9 +6740,9 @@ out:
6752 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6740 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6753 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6741 * rebalancing for all the cpus for whom scheduler ticks are stopped.
6754 */ 6742 */
6755static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6743static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6756{ 6744{
6757 struct rq *this_rq = cpu_rq(this_cpu); 6745 int this_cpu = this_rq->cpu;
6758 struct rq *rq; 6746 struct rq *rq;
6759 int balance_cpu; 6747 int balance_cpu;
6760 6748
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6781 update_idle_cpu_load(rq); 6769 update_idle_cpu_load(rq);
6782 raw_spin_unlock_irq(&rq->lock); 6770 raw_spin_unlock_irq(&rq->lock);
6783 6771
6784 rebalance_domains(balance_cpu, CPU_IDLE); 6772 rebalance_domains(rq, CPU_IDLE);
6785 6773
6786 if (time_after(this_rq->next_balance, rq->next_balance)) 6774 if (time_after(this_rq->next_balance, rq->next_balance))
6787 this_rq->next_balance = rq->next_balance; 6775 this_rq->next_balance = rq->next_balance;
@@ -6800,14 +6788,14 @@ end:
6800 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6788 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6801 * domain span are idle. 6789 * domain span are idle.
6802 */ 6790 */
6803static inline int nohz_kick_needed(struct rq *rq, int cpu) 6791static inline int nohz_kick_needed(struct rq *rq)
6804{ 6792{
6805 unsigned long now = jiffies; 6793 unsigned long now = jiffies;
6806 struct sched_domain *sd; 6794 struct sched_domain *sd;
6807 struct sched_group_power *sgp; 6795 struct sched_group_power *sgp;
6808 int nr_busy; 6796 int nr_busy, cpu = rq->cpu;
6809 6797
6810 if (unlikely(idle_cpu(cpu))) 6798 if (unlikely(rq->idle_balance))
6811 return 0; 6799 return 0;
6812 6800
6813 /* 6801 /*
@@ -6856,7 +6844,7 @@ need_kick:
6856 return 1; 6844 return 1;
6857} 6845}
6858#else 6846#else
6859static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6847static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6860#endif 6848#endif
6861 6849
6862/* 6850/*
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6865 */ 6853 */
6866static void run_rebalance_domains(struct softirq_action *h) 6854static void run_rebalance_domains(struct softirq_action *h)
6867{ 6855{
6868 int this_cpu = smp_processor_id(); 6856 struct rq *this_rq = this_rq();
6869 struct rq *this_rq = cpu_rq(this_cpu);
6870 enum cpu_idle_type idle = this_rq->idle_balance ? 6857 enum cpu_idle_type idle = this_rq->idle_balance ?
6871 CPU_IDLE : CPU_NOT_IDLE; 6858 CPU_IDLE : CPU_NOT_IDLE;
6872 6859
6873 rebalance_domains(this_cpu, idle); 6860 rebalance_domains(this_rq, idle);
6874 6861
6875 /* 6862 /*
6876 * If this cpu has a pending nohz_balance_kick, then do the 6863 * If this cpu has a pending nohz_balance_kick, then do the
6877 * balancing on behalf of the other idle cpus whose ticks are 6864 * balancing on behalf of the other idle cpus whose ticks are
6878 * stopped. 6865 * stopped.
6879 */ 6866 */
6880 nohz_idle_balance(this_cpu, idle); 6867 nohz_idle_balance(this_rq, idle);
6881} 6868}
6882 6869
6883static inline int on_null_domain(int cpu) 6870static inline int on_null_domain(struct rq *rq)
6884{ 6871{
6885 return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6872 return !rcu_dereference_sched(rq->sd);
6886} 6873}
6887 6874
6888/* 6875/*
6889 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6890 */ 6877 */
6891void trigger_load_balance(struct rq *rq, int cpu) 6878void trigger_load_balance(struct rq *rq)
6892{ 6879{
6893 /* Don't need to rebalance while attached to NULL domain */ 6880 /* Don't need to rebalance while attached to NULL domain */
6894 if (time_after_eq(jiffies, rq->next_balance) && 6881 if (unlikely(on_null_domain(rq)))
6895 likely(!on_null_domain(cpu))) 6882 return;
6883
6884 if (time_after_eq(jiffies, rq->next_balance))
6896 raise_softirq(SCHED_SOFTIRQ); 6885 raise_softirq(SCHED_SOFTIRQ);
6897#ifdef CONFIG_NO_HZ_COMMON 6886#ifdef CONFIG_NO_HZ_COMMON
6898 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6887 if (nohz_kick_needed(rq))
6899 nohz_balancer_kick(cpu); 6888 nohz_balancer_kick();
6900#endif 6889#endif
6901} 6890}
6902 6891
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1c4065575fa2..a2740b775b45 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1738 !test_tsk_need_resched(rq->curr) && 1738 !test_tsk_need_resched(rq->curr) &&
1739 has_pushable_tasks(rq) && 1739 has_pushable_tasks(rq) &&
1740 p->nr_cpus_allowed > 1 && 1740 p->nr_cpus_allowed > 1 &&
1741 rt_task(rq->curr) && 1741 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1742 (rq->curr->nr_cpus_allowed < 2 || 1742 (rq->curr->nr_cpus_allowed < 2 ||
1743 rq->curr->prio <= p->prio)) 1743 rq->curr->prio <= p->prio))
1744 push_rt_tasks(rq); 1744 push_rt_tasks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b21d633..c2119fd20f8b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
5#include <linux/mutex.h> 6#include <linux/mutex.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
@@ -9,6 +10,7 @@
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#include "cpupri.h" 12#include "cpupri.h"
13#include "cpudeadline.h"
12#include "cpuacct.h" 14#include "cpuacct.h"
13 15
14struct rq; 16struct rq;
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
73#define NICE_0_SHIFT SCHED_LOAD_SHIFT 75#define NICE_0_SHIFT SCHED_LOAD_SHIFT
74 76
75/* 77/*
78 * Single value that decides SCHED_DEADLINE internal math precision.
79 * 10 -> just above 1us
80 * 9 -> just above 0.5us
81 */
82#define DL_SCALE (10)
83
84/*
76 * These are the 'tuning knobs' of the scheduler: 85 * These are the 'tuning knobs' of the scheduler:
77 */ 86 */
78 87
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
81 */ 90 */
82#define RUNTIME_INF ((u64)~0ULL) 91#define RUNTIME_INF ((u64)~0ULL)
83 92
93static inline int fair_policy(int policy)
94{
95 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
96}
97
84static inline int rt_policy(int policy) 98static inline int rt_policy(int policy)
85{ 99{
86 if (policy == SCHED_FIFO || policy == SCHED_RR) 100 return policy == SCHED_FIFO || policy == SCHED_RR;
87 return 1; 101}
88 return 0; 102
103static inline int dl_policy(int policy)
104{
105 return policy == SCHED_DEADLINE;
89} 106}
90 107
91static inline int task_has_rt_policy(struct task_struct *p) 108static inline int task_has_rt_policy(struct task_struct *p)
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
93 return rt_policy(p->policy); 110 return rt_policy(p->policy);
94} 111}
95 112
113static inline int task_has_dl_policy(struct task_struct *p)
114{
115 return dl_policy(p->policy);
116}
117
118static inline bool dl_time_before(u64 a, u64 b)
119{
120 return (s64)(a - b) < 0;
121}
122
123/*
124 * Tells if entity @a should preempt entity @b.
125 */
126static inline bool
127dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
128{
129 return dl_time_before(a->deadline, b->deadline);
130}
131
96/* 132/*
97 * This is the priority-queue data structure of the RT scheduling class: 133 * This is the priority-queue data structure of the RT scheduling class:
98 */ 134 */
@@ -108,6 +144,47 @@ struct rt_bandwidth {
108 u64 rt_runtime; 144 u64 rt_runtime;
109 struct hrtimer rt_period_timer; 145 struct hrtimer rt_period_timer;
110}; 146};
147/*
148 * To keep the bandwidth of -deadline tasks and groups under control
149 * we need some place where:
150 * - store the maximum -deadline bandwidth of the system (the group);
151 * - cache the fraction of that bandwidth that is currently allocated.
152 *
153 * This is all done in the data structure below. It is similar to the
154 * one used for RT-throttling (rt_bandwidth), with the main difference
155 * that, since here we are only interested in admission control, we
156 * do not decrease any runtime while the group "executes", nor do we
157 * need a timer to replenish it.
158 *
159 * With respect to SMP, the bandwidth is given on a per-CPU basis,
160 * meaning that:
161 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
162 * - dl_total_bw array contains, in the i-th element, the currently
163 * allocated bandwidth on the i-th CPU.
164 * Moreover, groups consume bandwidth on each CPU, while tasks only
165 * consume bandwidth on the CPU they're running on.
166 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
167 * that will be shown the next time the proc or cgroup controls are
168 * read. It can in turn be changed by writing to its own
169 * control.
170 */
171struct dl_bandwidth {
172 raw_spinlock_t dl_runtime_lock;
173 u64 dl_runtime;
174 u64 dl_period;
175};
176
177static inline int dl_bandwidth_enabled(void)
178{
179 return sysctl_sched_rt_runtime >= 0;
180}
181
182extern struct dl_bw *dl_bw_of(int i);
183
184struct dl_bw {
185 raw_spinlock_t lock;
186 u64 bw, total_bw;
187};
111 188
112extern struct mutex sched_domains_mutex; 189extern struct mutex sched_domains_mutex;
113 190
@@ -364,6 +441,42 @@ struct rt_rq {
364#endif 441#endif
365}; 442};
366 443
444/* Deadline class' related fields in a runqueue */
445struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */
447 struct rb_root rb_root;
448 struct rb_node *rb_leftmost;
449
450 unsigned long dl_nr_running;
451
452#ifdef CONFIG_SMP
453 /*
454 * Deadline values of the currently executing and the
455 * earliest ready task on this rq. Caching these facilitates
456 * the decision whether or not a ready but not running task
457 * should migrate somewhere else.
458 */
459 struct {
460 u64 curr;
461 u64 next;
462 } earliest_dl;
463
464 unsigned long dl_nr_migratory;
465 unsigned long dl_nr_total;
466 int overloaded;
467
468 /*
469 * Tasks on this rq that can be pushed away. They are kept in
470 * an rb-tree, ordered by tasks' deadlines, with caching
471 * of the leftmost (earliest deadline) element.
472 */
473 struct rb_root pushable_dl_tasks_root;
474 struct rb_node *pushable_dl_tasks_leftmost;
475#else
476 struct dl_bw dl_bw;
477#endif
478};
479
367#ifdef CONFIG_SMP 480#ifdef CONFIG_SMP
368 481
369/* 482/*
@@ -382,6 +495,15 @@ struct root_domain {
382 cpumask_var_t online; 495 cpumask_var_t online;
383 496
384 /* 497 /*
498 * The bit corresponding to a CPU gets set here if such CPU has more
499 * than one runnable -deadline task (as it is below for RT tasks).
500 */
501 cpumask_var_t dlo_mask;
502 atomic_t dlo_count;
503 struct dl_bw dl_bw;
504 struct cpudl cpudl;
505
506 /*
385 * The "RT overload" flag: it gets set if a CPU has more than 507 * The "RT overload" flag: it gets set if a CPU has more than
386 * one runnable RT task. 508 * one runnable RT task.
387 */ 509 */
@@ -432,6 +554,7 @@ struct rq {
432 554
433 struct cfs_rq cfs; 555 struct cfs_rq cfs;
434 struct rt_rq rt; 556 struct rt_rq rt;
557 struct dl_rq dl;
435 558
436#ifdef CONFIG_FAIR_GROUP_SCHED 559#ifdef CONFIG_FAIR_GROUP_SCHED
437 /* list of leaf cfs_rq on this cpu: */ 560 /* list of leaf cfs_rq on this cpu: */
@@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void)
827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 950 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
828} 951}
829 952
830
831
832static inline int task_current(struct rq *rq, struct task_struct *p) 953static inline int task_current(struct rq *rq, struct task_struct *p)
833{ 954{
834 return rq->curr == p; 955 return rq->curr == p;
@@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = {
988#else 1109#else
989#define ENQUEUE_WAKING 0 1110#define ENQUEUE_WAKING 0
990#endif 1111#endif
1112#define ENQUEUE_REPLENISH 8
991 1113
992#define DEQUEUE_SLEEP 1 1114#define DEQUEUE_SLEEP 1
993 1115
@@ -1023,6 +1145,7 @@ struct sched_class {
1023 void (*set_curr_task) (struct rq *rq); 1145 void (*set_curr_task) (struct rq *rq);
1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1146 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1025 void (*task_fork) (struct task_struct *p); 1147 void (*task_fork) (struct task_struct *p);
1148 void (*task_dead) (struct task_struct *p);
1026 1149
1027 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1150 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1028 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1151 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@@ -1042,6 +1165,7 @@ struct sched_class {
1042 for (class = sched_class_highest; class; class = class->next) 1165 for (class = sched_class_highest; class; class = class->next)
1043 1166
1044extern const struct sched_class stop_sched_class; 1167extern const struct sched_class stop_sched_class;
1168extern const struct sched_class dl_sched_class;
1045extern const struct sched_class rt_sched_class; 1169extern const struct sched_class rt_sched_class;
1046extern const struct sched_class fair_sched_class; 1170extern const struct sched_class fair_sched_class;
1047extern const struct sched_class idle_sched_class; 1171extern const struct sched_class idle_sched_class;
@@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class;
1051 1175
1052extern void update_group_power(struct sched_domain *sd, int cpu); 1176extern void update_group_power(struct sched_domain *sd, int cpu);
1053 1177
1054extern void trigger_load_balance(struct rq *rq, int cpu); 1178extern void trigger_load_balance(struct rq *rq);
1055extern void idle_balance(int this_cpu, struct rq *this_rq); 1179extern void idle_balance(int this_cpu, struct rq *this_rq);
1056 1180
1057extern void idle_enter_fair(struct rq *this_rq); 1181extern void idle_enter_fair(struct rq *this_rq);
@@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
1068extern void sysrq_sched_debug_show(void); 1192extern void sysrq_sched_debug_show(void);
1069extern void sched_init_granularity(void); 1193extern void sched_init_granularity(void);
1070extern void update_max_interval(void); 1194extern void update_max_interval(void);
1195
1196extern void init_sched_dl_class(void);
1071extern void init_sched_rt_class(void); 1197extern void init_sched_rt_class(void);
1072extern void init_sched_fair_class(void); 1198extern void init_sched_fair_class(void);
1199extern void init_sched_dl_class(void);
1073 1200
1074extern void resched_task(struct task_struct *p); 1201extern void resched_task(struct task_struct *p);
1075extern void resched_cpu(int cpu); 1202extern void resched_cpu(int cpu);
@@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu);
1077extern struct rt_bandwidth def_rt_bandwidth; 1204extern struct rt_bandwidth def_rt_bandwidth;
1078extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1205extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1079 1206
1207extern struct dl_bandwidth def_dl_bandwidth;
1208extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1209extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1210
1211unsigned long to_ratio(u64 period, u64 runtime);
1212
1080extern void update_idle_cpu_load(struct rq *this_rq); 1213extern void update_idle_cpu_load(struct rq *this_rq);
1081 1214
1082extern void init_task_runnable_average(struct task_struct *p); 1215extern void init_task_runnable_average(struct task_struct *p);
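to_ratio() declared above is what turns a (period, runtime) pair into a summable bandwidth figure for admission control; the kernel has long expressed such ratios in 2^20 fixed point so per-task utilizations can be added and compared without floating point. A hedged sketch of that arithmetic and the admission test it enables (the MY_BW_* names and my_dl_admit() are illustrative, not the patch's code):

#include <linux/math64.h>
#include <linux/types.h>

#define MY_BW_SHIFT	20
#define MY_BW_UNIT	(1UL << MY_BW_SHIFT)	/* 1.0 in fixed point */

/* runtime/period scaled to 2^20, e.g. 10ms/100ms -> ~104857 (~0.1 * 2^20) */
static u64 my_to_ratio(u64 period, u64 runtime)
{
	if (period == 0)
		return 0;
	return div64_u64(runtime << MY_BW_SHIFT, period);
}

/* Admit a new reservation only if total utilization stays under the cap. */
static bool my_dl_admit(u64 total_bw, u64 cap_bw, u64 runtime, u64 period)
{
	return total_bw + my_to_ratio(period, runtime) <= cap_bw;
}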
@@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1353 1486
1354extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1487extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1488extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1489extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1356 1490
1357extern void cfs_bandwidth_usage_inc(void); 1491extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void); 1492extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
103 * Simple, special scheduling class for the per-CPU stop tasks: 103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */ 104 */
105const struct sched_class stop_sched_class = { 105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class, 106 .next = &dl_sched_class,
107 107
108 .enqueue_task = enqueue_task_stop, 108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop, 109 .dequeue_task = dequeue_task_stop,
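This one-line change is how the new class is slotted into the scheduler's priority chain: the classes form a singly linked list walked from highest to lowest (the for_each_class() helper in the sched.h hunk above), so with stop_sched_class now pointing at dl_sched_class the order becomes stop -> dl -> rt -> fair -> idle, and -deadline tasks preempt both RT and fair ones. A hedged sketch of the pattern that ordering serves, not the kernel's actual pick_next_task():

/* Walk the classes in priority order and take the first runnable task. */
static struct task_struct *my_pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	for (class = sched_class_highest; class; class = class->next) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}
	return NULL;	/* not reached: the idle class always has a task */
}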
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9a4500e4c189..8b93b3770f85 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -89,7 +89,7 @@ static void wakeup_softirqd(void)
89 * where hardirqs are disabled legitimately: 89 * where hardirqs are disabled legitimately:
90 */ 90 */
91#ifdef CONFIG_TRACE_IRQFLAGS 91#ifdef CONFIG_TRACE_IRQFLAGS
92static void __local_bh_disable(unsigned long ip, unsigned int cnt) 92void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
93{ 93{
94 unsigned long flags; 94 unsigned long flags;
95 95
@@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
107 /* 107 /*
108 * Were softirqs turned off above: 108 * Were softirqs turned off above:
109 */ 109 */
110 if (softirq_count() == cnt) 110 if (softirq_count() == (cnt & SOFTIRQ_MASK))
111 trace_softirqs_off(ip); 111 trace_softirqs_off(ip);
112 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
113 113
114 if (preempt_count() == cnt) 114 if (preempt_count() == cnt)
115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
116} 116}
117#else /* !CONFIG_TRACE_IRQFLAGS */ 117EXPORT_SYMBOL(__local_bh_disable_ip);
118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
119{
120 preempt_count_add(cnt);
121 barrier();
122}
123#endif /* CONFIG_TRACE_IRQFLAGS */ 118#endif /* CONFIG_TRACE_IRQFLAGS */
124 119
125void local_bh_disable(void)
126{
127 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
128}
129
130EXPORT_SYMBOL(local_bh_disable);
131
132static void __local_bh_enable(unsigned int cnt) 120static void __local_bh_enable(unsigned int cnt)
133{ 121{
134 WARN_ON_ONCE(!irqs_disabled()); 122 WARN_ON_ONCE(!irqs_disabled());
135 123
136 if (softirq_count() == cnt) 124 if (softirq_count() == (cnt & SOFTIRQ_MASK))
137 trace_softirqs_on(_RET_IP_); 125 trace_softirqs_on(_RET_IP_);
138 preempt_count_sub(cnt); 126 preempt_count_sub(cnt);
139} 127}
@@ -151,7 +139,7 @@ void _local_bh_enable(void)
151 139
152EXPORT_SYMBOL(_local_bh_enable); 140EXPORT_SYMBOL(_local_bh_enable);
153 141
154static inline void _local_bh_enable_ip(unsigned long ip) 142void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
155{ 143{
156 WARN_ON_ONCE(in_irq() || irqs_disabled()); 144 WARN_ON_ONCE(in_irq() || irqs_disabled());
157#ifdef CONFIG_TRACE_IRQFLAGS 145#ifdef CONFIG_TRACE_IRQFLAGS
@@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
166 * Keep preemption disabled until we are done with 154 * Keep preemption disabled until we are done with
167 * softirq processing: 155 * softirq processing:
168 */ 156 */
169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); 157 preempt_count_sub(cnt - 1);
170 158
171 if (unlikely(!in_interrupt() && local_softirq_pending())) { 159 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /* 160 /*
@@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
182#endif 170#endif
183 preempt_check_resched(); 171 preempt_check_resched();
184} 172}
185 173EXPORT_SYMBOL(__local_bh_enable_ip);
186void local_bh_enable(void)
187{
188 _local_bh_enable_ip(_RET_IP_);
189}
190EXPORT_SYMBOL(local_bh_enable);
191
192void local_bh_enable_ip(unsigned long ip)
193{
194 _local_bh_enable_ip(ip);
195}
196EXPORT_SYMBOL(local_bh_enable_ip);
197 174
198/* 175/*
199 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, 176 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
@@ -264,7 +241,7 @@ asmlinkage void __do_softirq(void)
264 pending = local_softirq_pending(); 241 pending = local_softirq_pending();
265 account_irq_enter_time(current); 242 account_irq_enter_time(current);
266 243
267 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); 244 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
268 in_hardirq = lockdep_softirq_start(); 245 in_hardirq = lockdep_softirq_start();
269 246
270 cpu = smp_processor_id(); 247 cpu = smp_processor_id();
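With local_bh_disable(), local_bh_enable() and local_bh_enable_ip() deleted here and the __local_bh_{disable,enable}_ip() workers exported instead, the public entry points presumably become header inlines (in include/linux/bottom_half.h) that pass the caller's IP and the SOFTIRQ_DISABLE_OFFSET count explicitly. A sketch of what such wrappers could look like, inferred from the removals above rather than quoted from the patch:

/* Assumes the declarations of __local_bh_disable_ip()/__local_bh_enable_ip()
 * plus SOFTIRQ_DISABLE_OFFSET and _THIS_IP_ are in scope, as they are in
 * kernel headers. */
static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable_ip(unsigned long ip)
{
	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

Passing the count instead of hard-coding it in the worker is also why __do_softirq() above now calls __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET) directly, and why the enable path subtracts cnt - 1 rather than a fixed offset.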
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a604726d0b..c8da99f905cf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -385,13 +385,6 @@ static struct ctl_table kern_table[] = {
385 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
386 }, 386 },
387 { 387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred", 388 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred, 389 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int), 390 .maxlen = sizeof(unsigned int),
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ea20f7d1ac2c..c833249ab0fb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -177,7 +177,7 @@ static bool can_stop_full_tick(void)
177 * TODO: kick full dynticks CPUs when 177 * TODO: kick full dynticks CPUs when
178 * sched_clock_stable is set. 178 * sched_clock_stable is set.
179 */ 179 */
180 if (!sched_clock_stable) { 180 if (!sched_clock_stable()) {
181 trace_tick_stop(0, "unstable sched clock\n"); 181 trace_tick_stop(0, "unstable sched clock\n");
182 /* 182 /*
183 * Don't allow the user to think they can get 183 * Don't allow the user to think they can get
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cc2f66f68dc5..294b8a271a04 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2558 if (unlikely(test_time_stamp(delta))) { 2558 if (unlikely(test_time_stamp(delta))) {
2559 int local_clock_stable = 1; 2559 int local_clock_stable = 1;
2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2561 local_clock_stable = sched_clock_stable; 2561 local_clock_stable = sched_clock_stable();
2562#endif 2562#endif
2563 WARN_ONCE(delta > (1ULL << 59), 2563 WARN_ONCE(delta > (1ULL << 59),
2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", 2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
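Both call sites in this and the previous hunk switch from reading a sched_clock_stable variable to calling sched_clock_stable(): hiding the flag behind a function lets the implementation back it with something cheaper than a global load, for instance a static_key whose branch gets patched at runtime. A hedged illustration of that general pattern with made-up names (my_*), not the patch's clock code:

#include <linux/jump_label.h>

static struct static_key my_clock_stable = STATIC_KEY_INIT_FALSE;

static inline int my_sched_clock_stable(void)
{
	/* With jump labels enabled this is a patched jump, not a load. */
	return static_key_false(&my_clock_stable);
}

static void my_set_clock_stable(void)
{
	if (!my_sched_clock_stable())
		static_key_slow_inc(&my_clock_stable);
}

static void my_clear_clock_stable(void)
{
	if (my_sched_clock_stable())
		static_key_slow_dec(&my_clock_stable);
}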
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fee77e15d815..6e32635e5e57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -16,6 +16,7 @@
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h> 18#include <linux/sched/rt.h>
19#include <linux/sched/deadline.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20#include "trace.h" 21#include "trace.h"
21 22
@@ -27,6 +28,8 @@ static int wakeup_cpu;
27static int wakeup_current_cpu; 28static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 29static unsigned wakeup_prio = -1;
29static int wakeup_rt; 30static int wakeup_rt;
31static int wakeup_dl;
32static int tracing_dl = 0;
30 33
31static arch_spinlock_t wakeup_lock = 34static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 35 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr)
437{ 440{
438 wakeup_cpu = -1; 441 wakeup_cpu = -1;
439 wakeup_prio = -1; 442 wakeup_prio = -1;
443 tracing_dl = 0;
440 444
441 if (wakeup_task) 445 if (wakeup_task)
442 put_task_struct(wakeup_task); 446 put_task_struct(wakeup_task);
@@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472 tracing_record_cmdline(p); 476 tracing_record_cmdline(p);
473 tracing_record_cmdline(current); 477 tracing_record_cmdline(current);
474 478
475 if ((wakeup_rt && !rt_task(p)) || 479 /*
476 p->prio >= wakeup_prio || 480 * Semantic is like this:
477 p->prio >= current->prio) 481 * - wakeup tracer handles all tasks in the system, independently
482 * from their scheduling class;
483 * - wakeup_rt tracer handles tasks belonging to sched_dl and
484 * sched_rt class;
485 * - wakeup_dl handles tasks belonging to sched_dl class only.
486 */
487 if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
488 (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
489 (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
478 return; 490 return;
479 491
480 pc = preempt_count(); 492 pc = preempt_count();
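The compound test above encodes the three tracer flavours spelled out in the comment; restated as a standalone predicate it is easier to read. A hedged, illustrative rewrite (the kernel keeps the check inline; my_wakeup_ignore() is not part of the patch, and curr stands for the waking task, i.e. current):

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>

/* Return true when the wakeup of @p should not (re)start tracing. */
static bool my_wakeup_ignore(struct task_struct *p, struct task_struct *curr,
			     int wakeup_dl, int wakeup_rt, int tracing_dl,
			     unsigned int wakeup_prio)
{
	if (tracing_dl)			/* already latched onto a -deadline task */
		return true;
	if (wakeup_dl && !dl_task(p))	/* wakeup_dl: -deadline tasks only */
		return true;
	if (wakeup_rt && !dl_task(p) && !rt_task(p))	/* wakeup_rt: dl or rt */
		return true;
	/* plain wakeup: only trace wakeups that beat the current max priority */
	if (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= curr->prio))
		return true;
	return false;
}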
@@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
486 arch_spin_lock(&wakeup_lock); 498 arch_spin_lock(&wakeup_lock);
487 499
488 /* check for races. */ 500 /* check for races. */
489 if (!tracer_enabled || p->prio >= wakeup_prio) 501 if (!tracer_enabled || tracing_dl ||
502 (!dl_task(p) && p->prio >= wakeup_prio))
490 goto out_locked; 503 goto out_locked;
491 504
492 /* reset the trace */ 505 /* reset the trace */
@@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
496 wakeup_current_cpu = wakeup_cpu; 509 wakeup_current_cpu = wakeup_cpu;
497 wakeup_prio = p->prio; 510 wakeup_prio = p->prio;
498 511
512 /*
513 * Once you start tracing a -deadline task, don't bother tracing
514 * another task until the first one wakes up.
515 */
516 if (dl_task(p))
517 tracing_dl = 1;
518 else
519 tracing_dl = 0;
520
499 wakeup_task = p; 521 wakeup_task = p;
500 get_task_struct(wakeup_task); 522 get_task_struct(wakeup_task);
501 523
@@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr)
597 619
598static int wakeup_tracer_init(struct trace_array *tr) 620static int wakeup_tracer_init(struct trace_array *tr)
599{ 621{
622 wakeup_dl = 0;
600 wakeup_rt = 0; 623 wakeup_rt = 0;
601 return __wakeup_tracer_init(tr); 624 return __wakeup_tracer_init(tr);
602} 625}
603 626
604static int wakeup_rt_tracer_init(struct trace_array *tr) 627static int wakeup_rt_tracer_init(struct trace_array *tr)
605{ 628{
629 wakeup_dl = 0;
606 wakeup_rt = 1; 630 wakeup_rt = 1;
607 return __wakeup_tracer_init(tr); 631 return __wakeup_tracer_init(tr);
608} 632}
609 633
634static int wakeup_dl_tracer_init(struct trace_array *tr)
635{
636 wakeup_dl = 1;
637 wakeup_rt = 0;
638 return __wakeup_tracer_init(tr);
639}
640
610static void wakeup_tracer_reset(struct trace_array *tr) 641static void wakeup_tracer_reset(struct trace_array *tr)
611{ 642{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; 643 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
@@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly =
674 .use_max_tr = true, 705 .use_max_tr = true,
675}; 706};
676 707
708static struct tracer wakeup_dl_tracer __read_mostly =
709{
710 .name = "wakeup_dl",
711 .init = wakeup_dl_tracer_init,
712 .reset = wakeup_tracer_reset,
713 .start = wakeup_tracer_start,
714 .stop = wakeup_tracer_stop,
715 .wait_pipe = poll_wait_pipe,
716 .print_max = true,
717 .print_header = wakeup_print_header,
718 .print_line = wakeup_print_line,
719 .flags = &tracer_flags,
720 .set_flag = wakeup_set_flag,
721 .flag_changed = wakeup_flag_changed,
722#ifdef CONFIG_FTRACE_SELFTEST
723 .selftest = trace_selftest_startup_wakeup,
724#endif
725 .open = wakeup_trace_open,
726 .close = wakeup_trace_close,
727 .use_max_tr = true,
728};
729
677__init static int init_wakeup_tracer(void) 730__init static int init_wakeup_tracer(void)
678{ 731{
679 int ret; 732 int ret;
@@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void)
686 if (ret) 739 if (ret)
687 return ret; 740 return ret;
688 741
742 ret = register_tracer(&wakeup_dl_tracer);
743 if (ret)
744 return ret;
745
689 return 0; 746 return 0;
690} 747}
691core_initcall(init_wakeup_tracer); 748core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a7329b7902f8..e98fca60974f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1022#ifdef CONFIG_SCHED_TRACER 1022#ifdef CONFIG_SCHED_TRACER
1023static int trace_wakeup_test_thread(void *data) 1023static int trace_wakeup_test_thread(void *data)
1024{ 1024{
1025 /* Make this a RT thread, doesn't need to be too high */ 1025 /* Make this a -deadline thread */
1026 static const struct sched_param param = { .sched_priority = 5 }; 1026 static const struct sched_attr attr = {
1027 .sched_policy = SCHED_DEADLINE,
1028 .sched_runtime = 100000ULL,
1029 .sched_deadline = 10000000ULL,
1030 .sched_period = 10000000ULL
1031 };
1027 struct completion *x = data; 1032 struct completion *x = data;
1028 1033
1029 sched_setscheduler(current, SCHED_FIFO, &param); 1034 sched_setattr(current, &attr);
1030 1035
1031 /* Make it know we have a new prio */ 1036 /* Make it know we have a new prio */
1032 complete(x); 1037 complete(x);
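The selftest above doubles as a template for the user-visible side of SCHED_DEADLINE: the same runtime/deadline/period triple can be requested from userspace through the new sched_setattr() syscall, for which glibc provides no wrapper. A hedged userspace sketch follows; struct sched_attr is declared locally because libc headers of this era do not ship it, and the fallback syscall number is the x86_64 one, so check your architecture's syscall table before relying on it.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314		/* x86_64; other arches differ */
#endif
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL/BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO/RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int sched_setattr(pid_t pid, const struct sched_attr *attr,
			 unsigned int flags)
{
	return syscall(__NR_sched_setattr, pid, attr, flags);
}

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 100000ULL,	/* 100 us of budget ...  */
		.sched_deadline	= 10000000ULL,	/* ... due within 10 ms  */
		.sched_period	= 10000000ULL,	/* ... every 10 ms       */
	};

	/* Typically requires root privileges. pid 0 means the calling thread. */
	if (sched_setattr(0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	puts("now running as a -deadline task");
	return 0;
}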
@@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data)
1040 /* we are awake, now wait to disappear */ 1045 /* we are awake, now wait to disappear */
1041 while (!kthread_should_stop()) { 1046 while (!kthread_should_stop()) {
1042 /* 1047 /*
1043 * This is an RT task, do short sleeps to let 1048 * This will likely be the system top priority
1044 * others run. 1049 * task, do short sleeps to let others run.
1045 */ 1050 */
1046 msleep(100); 1051 msleep(100);
1047 } 1052 }
@@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1054{ 1059{
1055 unsigned long save_max = tracing_max_latency; 1060 unsigned long save_max = tracing_max_latency;
1056 struct task_struct *p; 1061 struct task_struct *p;
1057 struct completion isrt; 1062 struct completion is_ready;
1058 unsigned long count; 1063 unsigned long count;
1059 int ret; 1064 int ret;
1060 1065
1061 init_completion(&isrt); 1066 init_completion(&is_ready);
1062 1067
1063 /* create a high prio thread */ 1068 /* create a -deadline thread */
1064 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); 1069 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
1065 if (IS_ERR(p)) { 1070 if (IS_ERR(p)) {
1066 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1071 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1067 return -1; 1072 return -1;
1068 } 1073 }
1069 1074
1070 /* make sure the thread is running at an RT prio */ 1075 /* make sure the thread is running at -deadline policy */
1071 wait_for_completion(&isrt); 1076 wait_for_completion(&is_ready);
1072 1077
1073 /* start the tracing */ 1078 /* start the tracing */
1074 ret = tracer_init(trace, tr); 1079 ret = tracer_init(trace, tr);
@@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1082 1087
1083 while (p->on_rq) { 1088 while (p->on_rq) {
1084 /* 1089 /*
1085 * Sleep to make sure the RT thread is asleep too. 1090 * Sleep to make sure the -deadline thread is asleep too.
1086 * On virtual machines we can't rely on timings, 1091 * On virtual machines we can't rely on timings,
1087 * but we want to make sure this test still works. 1092 * but we want to make sure this test still works.
1088 */ 1093 */
1089 msleep(100); 1094 msleep(100);
1090 } 1095 }
1091 1096
1092 init_completion(&isrt); 1097 init_completion(&is_ready);
1093 1098
1094 wake_up_process(p); 1099 wake_up_process(p);
1095 1100
1096 /* Wait for the task to wake up */ 1101 /* Wait for the task to wake up */
1097 wait_for_completion(&isrt); 1102 wait_for_completion(&is_ready);
1098 1103
1099 /* stop the tracing. */ 1104 /* stop the tracing. */
1100 tracing_stop(); 1105 tracing_stop();
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4638e6f0238..82de78603686 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1623,11 +1623,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1623 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1623 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1624 !sysctl_tcp_low_latency && 1624 !sysctl_tcp_low_latency &&
1625 net_dma_find_channel()) { 1625 net_dma_find_channel()) {
1626 preempt_enable_no_resched(); 1626 preempt_enable();
1627 tp->ucopy.pinned_list = 1627 tp->ucopy.pinned_list =
1628 dma_pin_iovec_pages(msg->msg_iov, len); 1628 dma_pin_iovec_pages(msg->msg_iov, len);
1629 } else { 1629 } else {
1630 preempt_enable_no_resched(); 1630 preempt_enable();
1631 } 1631 }
1632 } 1632 }
1633#endif 1633#endif