author     Linus Torvalds <torvalds@linux-foundation.org>  2014-01-20 13:42:08 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-01-20 13:42:08 -0500
commit     a0fa1dd3cdbccec9597fe53b6177a9aa6e20f2f8
tree       b249854573815eedf377e554f0ea516f86411841
parent     9326657abe1a83ed4b4f396b923ca1217fd50cba
parent     eaad45132c564ce377e6dce05e78e08e456d5315
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
- Add the initial implementation of SCHED_DEADLINE support: a real-time
   scheduling policy where tasks that meet their deadlines and
   periodically execute their instances in less than their runtime quota
   see real-time scheduling and won't miss any of their deadlines.
   Tasks that go over their quota get delayed (available to privileged
   users for now; see the usage sketch after the diffstat below)
- Clean up and fix preempt_enable_no_resched() abuse all around the
tree
- Do sched_clock() performance optimizations on x86 and elsewhere
- Fix and improve auto-NUMA balancing
- Fix and clean up the idle loop
- Apply various cleanups and fixes
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
sched: Fix __sched_setscheduler() nice test
sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
sched: Fix up attr::sched_priority warning
sched: Fix up scheduler syscall LTP fails
sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
sched/core: Fix htmldocs warnings
sched/deadline: No need to check p if dl_se is valid
sched/deadline: Remove unused variables
sched/deadline: Fix sparse static warnings
m68k: Fix build warning in mac_via.h
sched, thermal: Clean up preempt_enable_no_resched() abuse
sched, net: Fixup busy_loop_us_clock()
sched, net: Clean up preempt_enable_no_resched() abuse
sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
sched/preempt, locking: Rework local_bh_{dis,en}able()
sched/clock, x86: Avoid a runtime condition in native_sched_clock()
sched/clock: Fix up clear_sched_clock_stable()
sched/clock, x86: Use a static_key for sched_clock_stable
sched/clock: Remove local_irq_disable() from the clocks
sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
...
63 files changed, 3775 insertions, 626 deletions
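
As a concrete illustration of the SCHED_DEADLINE item above: the series wires up two new syscalls, sched_setattr() and sched_getattr() (see the ARM and x86 syscall-table hunks below), which take an extended parameter block instead of struct sched_param. The user-space sketch that follows is not part of the merge; the struct sched_attr layout, the SCHED_DEADLINE value (6) and the syscall number (314, taken from the x86-64 syscall_64.tbl hunk below) are assumptions based on the mainline headers of this period.

/*
 * Hedged sketch only: request SCHED_DEADLINE for the calling thread via the
 * new sched_setattr() syscall.  The struct layout and constants below follow
 * the mainline uapi headers of this era and are assumptions here, not
 * something this diff defines verbatim.  Needs CAP_SYS_NICE/root, matching
 * the "available to privileged users for now" note in the pull request.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SYS_sched_setattr
#define SYS_sched_setattr 314		/* x86-64 number from syscall_64.tbl below */
#endif

#define SCHED_DEADLINE 6		/* assumed uapi value */

struct sched_attr {
	uint32_t size;			/* 48 == SCHED_ATTR_SIZE_VER0 (see the sched.h hunk) */
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* used by SCHED_NORMAL/SCHED_BATCH */
	uint32_t sched_priority;	/* used by SCHED_FIFO/SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE parameters, in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	=  10 * 1000 * 1000ULL,	/*  10 ms budget per instance */
		.sched_deadline	=  30 * 1000 * 1000ULL,	/*  30 ms relative deadline */
		.sched_period	= 100 * 1000 * 1000ULL,	/* 100 ms period */
	};

	/* pid 0 == calling thread; the trailing 0 is the (currently unused) flags word */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
		perror("sched_setattr");
		return 1;
	}
	printf("now running under SCHED_DEADLINE\n");
	return 0;
}
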
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 26b7ee491df8..6d486404200e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -428,11 +428,6 @@ rate for each task. | |||
428 | numa_balancing_scan_size_mb is how many megabytes worth of pages are | 428 | numa_balancing_scan_size_mb is how many megabytes worth of pages are |
429 | scanned for a given scan. | 429 | scanned for a given scan. |
430 | 430 | ||
431 | numa_balancing_settle_count is how many scan periods must complete before | ||
432 | the schedule balancer stops pushing the task towards a preferred node. This | ||
433 | gives the scheduler a chance to place the task on an alternative node if the | ||
434 | preferred node is overloaded. | ||
435 | |||
436 | numa_balancing_migrate_deferred is how many page migrations get skipped | 431 | numa_balancing_migrate_deferred is how many page migrations get skipped |
437 | unconditionally, after a page migration is skipped because a page is shared | 432 | unconditionally, after a page migration is skipped because a page is shared |
438 | with other tasks. This reduces page migration overhead, and determines | 433 | with other tasks. This reduces page migration overhead, and determines |
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 141baa3f9a72..acabef1a75df 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@ | |||
15 | 15 | ||
16 | #include <uapi/asm/unistd.h> | 16 | #include <uapi/asm/unistd.h> |
17 | 17 | ||
18 | #define __NR_syscalls (380) | 18 | #define __NR_syscalls (384) |
19 | #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0) | 19 | #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0) |
20 | 20 | ||
21 | #define __ARCH_WANT_STAT64 | 21 | #define __ARCH_WANT_STAT64 |
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index af33b44990ed..fb5584d0cc05 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,8 @@ | |||
406 | #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) | 406 | #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) |
407 | #define __NR_kcmp (__NR_SYSCALL_BASE+378) | 407 | #define __NR_kcmp (__NR_SYSCALL_BASE+378) |
408 | #define __NR_finit_module (__NR_SYSCALL_BASE+379) | 408 | #define __NR_finit_module (__NR_SYSCALL_BASE+379) |
409 | #define __NR_sched_setattr (__NR_SYSCALL_BASE+380) | ||
410 | #define __NR_sched_getattr (__NR_SYSCALL_BASE+381) | ||
409 | 411 | ||
410 | /* | 412 | /* |
411 | * This may need to be greater than __NR_last_syscall+1 in order to | 413 | * This may need to be greater than __NR_last_syscall+1 in order to |
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index c6ca7e376773..166e945de832 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,8 @@ | |||
389 | CALL(sys_process_vm_writev) | 389 | CALL(sys_process_vm_writev) |
390 | CALL(sys_kcmp) | 390 | CALL(sys_kcmp) |
391 | CALL(sys_finit_module) | 391 | CALL(sys_finit_module) |
392 | /* 380 */ CALL(sys_sched_setattr) | ||
393 | CALL(sys_sched_getattr) | ||
392 | #ifndef syscalls_counted | 394 | #ifndef syscalls_counted |
393 | .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls | 395 | .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls |
394 | #define syscalls_counted | 396 | #define syscalls_counted |
diff --git a/arch/m68k/include/asm/mac_via.h b/arch/m68k/include/asm/mac_via.h
index aeeedf8b2d25..fe3fc9ae1b69 100644
--- a/arch/m68k/include/asm/mac_via.h
+++ b/arch/m68k/include/asm/mac_via.h
@@ -254,6 +254,8 @@ | |||
254 | extern volatile __u8 *via1,*via2; | 254 | extern volatile __u8 *via1,*via2; |
255 | extern int rbv_present,via_alt_mapping; | 255 | extern int rbv_present,via_alt_mapping; |
256 | 256 | ||
257 | struct irq_desc; | ||
258 | |||
257 | extern void via_register_interrupts(void); | 259 | extern void via_register_interrupts(void); |
258 | extern void via_irq_enable(int); | 260 | extern void via_irq_enable(int); |
259 | extern void via_irq_disable(int); | 261 | extern void via_irq_disable(int); |
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0ac6b4..1da25a5f96f9 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _ASM_X86_MWAIT_H | 1 | #ifndef _ASM_X86_MWAIT_H |
2 | #define _ASM_X86_MWAIT_H | 2 | #define _ASM_X86_MWAIT_H |
3 | 3 | ||
4 | #include <linux/sched.h> | ||
5 | |||
4 | #define MWAIT_SUBSTATE_MASK 0xf | 6 | #define MWAIT_SUBSTATE_MASK 0xf |
5 | #define MWAIT_CSTATE_MASK 0xf | 7 | #define MWAIT_CSTATE_MASK 0xf |
6 | #define MWAIT_SUBSTATE_SIZE 4 | 8 | #define MWAIT_SUBSTATE_SIZE 4 |
@@ -13,4 +15,45 @@ | |||
13 | 15 | ||
14 | #define MWAIT_ECX_INTERRUPT_BREAK 0x1 | 16 | #define MWAIT_ECX_INTERRUPT_BREAK 0x1 |
15 | 17 | ||
18 | static inline void __monitor(const void *eax, unsigned long ecx, | ||
19 | unsigned long edx) | ||
20 | { | ||
21 | /* "monitor %eax, %ecx, %edx;" */ | ||
22 | asm volatile(".byte 0x0f, 0x01, 0xc8;" | ||
23 | :: "a" (eax), "c" (ecx), "d"(edx)); | ||
24 | } | ||
25 | |||
26 | static inline void __mwait(unsigned long eax, unsigned long ecx) | ||
27 | { | ||
28 | /* "mwait %eax, %ecx;" */ | ||
29 | asm volatile(".byte 0x0f, 0x01, 0xc9;" | ||
30 | :: "a" (eax), "c" (ecx)); | ||
31 | } | ||
32 | |||
33 | /* | ||
34 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
35 | * which can obviate IPI to trigger checking of need_resched. | ||
36 | * We execute MONITOR against need_resched and enter optimized wait state | ||
37 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
38 | * up from MWAIT (without an IPI). | ||
39 | * | ||
40 | * New with Core Duo processors, MWAIT can take some hints based on CPU | ||
41 | * capability. | ||
42 | */ | ||
43 | static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) | ||
44 | { | ||
45 | if (!current_set_polling_and_test()) { | ||
46 | if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { | ||
47 | mb(); | ||
48 | clflush((void *)¤t_thread_info()->flags); | ||
49 | mb(); | ||
50 | } | ||
51 | |||
52 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
53 | if (!need_resched()) | ||
54 | __mwait(eax, ecx); | ||
55 | } | ||
56 | current_clr_polling(); | ||
57 | } | ||
58 | |||
16 | #endif /* _ASM_X86_MWAIT_H */ | 59 | #endif /* _ASM_X86_MWAIT_H */ |
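
The hunk above moves __monitor()/__mwait() into asm/mwait.h and adds a common mwait_idle_with_hints() helper; the driver hunks further down (acpi_pad, intel_idle, intel_powerclamp) then collapse their open-coded MONITOR/MWAIT sequences into a single call. Below is a minimal sketch of that call pattern, assuming kernel context and the helper as defined above; the surrounding stop/start_critical_timings() calls mirror the existing call sites and are not required by the helper itself.

/* Hedged sketch, not part of this diff: the consolidated idle-entry pattern. */
#include <linux/irqflags.h>	/* stop_critical_timings(), start_critical_timings() */
#include <asm/mwait.h>		/* mwait_idle_with_hints() */

static void example_mwait_idle(unsigned long eax, unsigned long ecx)
{
	stop_critical_timings();
	/* Sets the polling flag, re-checks need_resched(), then MONITOR+MWAIT,
	 * and clears polling again on return. */
	mwait_idle_with_hints(eax, ecx);
	start_critical_timings();
}
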
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4057f9..24821f5768bc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -700,29 +700,6 @@ static inline void sync_core(void) | |||
700 | #endif | 700 | #endif |
701 | } | 701 | } |
702 | 702 | ||
703 | static inline void __monitor(const void *eax, unsigned long ecx, | ||
704 | unsigned long edx) | ||
705 | { | ||
706 | /* "monitor %eax, %ecx, %edx;" */ | ||
707 | asm volatile(".byte 0x0f, 0x01, 0xc8;" | ||
708 | :: "a" (eax), "c" (ecx), "d"(edx)); | ||
709 | } | ||
710 | |||
711 | static inline void __mwait(unsigned long eax, unsigned long ecx) | ||
712 | { | ||
713 | /* "mwait %eax, %ecx;" */ | ||
714 | asm volatile(".byte 0x0f, 0x01, 0xc9;" | ||
715 | :: "a" (eax), "c" (ecx)); | ||
716 | } | ||
717 | |||
718 | static inline void __sti_mwait(unsigned long eax, unsigned long ecx) | ||
719 | { | ||
720 | trace_hardirqs_on(); | ||
721 | /* "mwait %eax, %ecx;" */ | ||
722 | asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" | ||
723 | :: "a" (eax), "c" (ecx)); | ||
724 | } | ||
725 | |||
726 | extern void select_idle_routine(const struct cpuinfo_x86 *c); | 703 | extern void select_idle_routine(const struct cpuinfo_x86 *c); |
727 | extern void init_amd_e400_c1e_mask(void); | 704 | extern void init_amd_e400_c1e_mask(void); |
728 | 705 | ||
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0eb5d0c..3de54ef0aea5 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/pm.h> | 4 | #include <linux/pm.h> |
5 | #include <linux/percpu.h> | 5 | #include <linux/percpu.h> |
6 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
7 | #include <linux/math64.h> | ||
7 | 8 | ||
8 | #define TICK_SIZE (tick_nsec / 1000) | 9 | #define TICK_SIZE (tick_nsec / 1000) |
9 | 10 | ||
@@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void); | |||
12 | 13 | ||
13 | extern int no_timer_check; | 14 | extern int no_timer_check; |
14 | 15 | ||
15 | /* Accelerators for sched_clock() | 16 | /* |
16 | * convert from cycles(64bits) => nanoseconds (64bits) | 17 | * We use the full linear equation: f(x) = a + b*x, in order to allow |
17 | * basic equation: | 18 | * a continuous function in the face of dynamic freq changes. |
18 | * ns = cycles / (freq / ns_per_sec) | ||
19 | * ns = cycles * (ns_per_sec / freq) | ||
20 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
21 | * ns = cycles * (10^6 / cpu_khz) | ||
22 | * | 19 | * |
23 | * Then we use scaling math (suggested by george@mvista.com) to get: | 20 | * Continuity means that when our frequency changes our slope (b); we want to |
24 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | 21 | * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. |
25 | * ns = cycles * cyc2ns_scale / SC | ||
26 | * | 22 | * |
27 | * And since SC is a constant power of two, we can convert the div | 23 | * Without an offset (a) the above would not be possible. |
28 | * into a shift. | ||
29 | * | 24 | * |
30 | * We can use khz divisor instead of mhz to keep a better precision, since | 25 | * See the comment near cycles_2_ns() for details on how we compute (b). |
31 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
32 | * (mathieu.desnoyers@polymtl.ca) | ||
33 | * | ||
34 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
35 | * | ||
36 | * In: | ||
37 | * | ||
38 | * ns = cycles * cyc2ns_scale / SC | ||
39 | * | ||
40 | * Although we may still have enough bits to store the value of ns, | ||
41 | * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, | ||
42 | * leading to an incorrect result. | ||
43 | * | ||
44 | * To avoid this, we can decompose 'cycles' into quotient and remainder | ||
45 | * of division by SC. Then, | ||
46 | * | ||
47 | * ns = (quot * SC + rem) * cyc2ns_scale / SC | ||
48 | * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC | ||
49 | * | ||
50 | * - sqazi@google.com | ||
51 | */ | 26 | */ |
52 | 27 | struct cyc2ns_data { | |
53 | DECLARE_PER_CPU(unsigned long, cyc2ns); | 28 | u32 cyc2ns_mul; |
54 | DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); | 29 | u32 cyc2ns_shift; |
55 | 30 | u64 cyc2ns_offset; | |
56 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | 31 | u32 __count; |
57 | 32 | /* u32 hole */ | |
58 | static inline unsigned long long __cycles_2_ns(unsigned long long cyc) | 33 | }; /* 24 bytes -- do not grow */ |
59 | { | 34 | |
60 | int cpu = smp_processor_id(); | 35 | extern struct cyc2ns_data *cyc2ns_read_begin(void); |
61 | unsigned long long ns = per_cpu(cyc2ns_offset, cpu); | 36 | extern void cyc2ns_read_end(struct cyc2ns_data *); |
62 | ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), | ||
63 | (1UL << CYC2NS_SCALE_FACTOR)); | ||
64 | return ns; | ||
65 | } | ||
66 | |||
67 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | ||
68 | { | ||
69 | unsigned long long ns; | ||
70 | unsigned long flags; | ||
71 | |||
72 | local_irq_save(flags); | ||
73 | ns = __cycles_2_ns(cyc); | ||
74 | local_irq_restore(flags); | ||
75 | |||
76 | return ns; | ||
77 | } | ||
78 | 37 | ||
79 | #endif /* _ASM_X86_TIMER_H */ | 38 | #endif /* _ASM_X86_TIMER_H */ |
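
Spelling out the continuity condition from the new comment above: when a cpufreq change switches from slope b to slope b' at TSC reading t, the new offset a' must satisfy

\[
a + b\,t \;=\; a' + b'\,t
\quad\Longrightarrow\quad
a' \;=\; a + (b - b')\,t \;=\; f(t) - b'\,t .
\]

This is exactly what the reworked set_cyc2ns_scale() in the tsc.c hunk below computes: cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, cyc2ns_mul, CYC2NS_SCALE_FACTOR), i.e. the current nanosecond value minus the new slope applied to the current cycle count.
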
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..e69182fd01cf 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, | |||
150 | } | 150 | } |
151 | EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); | 151 | EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); |
152 | 152 | ||
153 | /* | ||
154 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
155 | * which can obviate IPI to trigger checking of need_resched. | ||
156 | * We execute MONITOR against need_resched and enter optimized wait state | ||
157 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
158 | * up from MWAIT (without an IPI). | ||
159 | * | ||
160 | * New with Core Duo processors, MWAIT can take some hints based on CPU | ||
161 | * capability. | ||
162 | */ | ||
163 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | ||
164 | { | ||
165 | if (!need_resched()) { | ||
166 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) | ||
167 | clflush((void *)¤t_thread_info()->flags); | ||
168 | |||
169 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
170 | smp_mb(); | ||
171 | if (!need_resched()) | ||
172 | __mwait(ax, cx); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) | 153 | void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) |
177 | { | 154 | { |
178 | unsigned int cpu = smp_processor_id(); | 155 | unsigned int cpu = smp_processor_id(); |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..8bc79cddd9a2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) | |||
487 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 487 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
488 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 488 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
489 | if (!check_tsc_unstable()) | 489 | if (!check_tsc_unstable()) |
490 | sched_clock_stable = 1; | 490 | set_sched_clock_stable(); |
491 | } | 491 | } |
492 | 492 | ||
493 | #ifdef CONFIG_X86_64 | 493 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b342c026..1a439c047ff3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) | |||
93 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 93 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
94 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 94 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
95 | if (!check_tsc_unstable()) | 95 | if (!check_tsc_unstable()) |
96 | sched_clock_stable = 1; | 96 | set_sched_clock_stable(); |
97 | } | 97 | } |
98 | 98 | ||
99 | /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ | 99 | /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..b88645191fe5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = { | |||
1883 | 1883 | ||
1884 | void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) | 1884 | void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) |
1885 | { | 1885 | { |
1886 | struct cyc2ns_data *data; | ||
1887 | |||
1886 | userpg->cap_user_time = 0; | 1888 | userpg->cap_user_time = 0; |
1887 | userpg->cap_user_time_zero = 0; | 1889 | userpg->cap_user_time_zero = 0; |
1888 | userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; | 1890 | userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; |
1889 | userpg->pmc_width = x86_pmu.cntval_bits; | 1891 | userpg->pmc_width = x86_pmu.cntval_bits; |
1890 | 1892 | ||
1891 | if (!sched_clock_stable) | 1893 | if (!sched_clock_stable()) |
1892 | return; | 1894 | return; |
1893 | 1895 | ||
1896 | data = cyc2ns_read_begin(); | ||
1897 | |||
1894 | userpg->cap_user_time = 1; | 1898 | userpg->cap_user_time = 1; |
1895 | userpg->time_mult = this_cpu_read(cyc2ns); | 1899 | userpg->time_mult = data->cyc2ns_mul; |
1896 | userpg->time_shift = CYC2NS_SCALE_FACTOR; | 1900 | userpg->time_shift = data->cyc2ns_shift; |
1897 | userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; | 1901 | userpg->time_offset = data->cyc2ns_offset - now; |
1898 | 1902 | ||
1899 | userpg->cap_user_time_zero = 1; | 1903 | userpg->cap_user_time_zero = 1; |
1900 | userpg->time_zero = this_cpu_read(cyc2ns_offset); | 1904 | userpg->time_zero = data->cyc2ns_offset; |
1905 | |||
1906 | cyc2ns_read_end(data); | ||
1901 | } | 1907 | } |
1902 | 1908 | ||
1903 | /* | 1909 | /* |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..f5252c4eec8c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void) | |||
1417 | * The WBINVD is insufficient due to the spurious-wakeup | 1417 | * The WBINVD is insufficient due to the spurious-wakeup |
1418 | * case where we return around the loop. | 1418 | * case where we return around the loop. |
1419 | */ | 1419 | */ |
1420 | mb(); | ||
1420 | clflush(mwait_ptr); | 1421 | clflush(mwait_ptr); |
1422 | mb(); | ||
1421 | __monitor(mwait_ptr, 0, 0); | 1423 | __monitor(mwait_ptr, 0, 0); |
1422 | mb(); | 1424 | mb(); |
1423 | __mwait(eax, 0); | 1425 | __mwait(eax, 0); |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..6377fb28b958 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/clocksource.h> | 11 | #include <linux/clocksource.h> |
12 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
13 | #include <linux/timex.h> | 13 | #include <linux/timex.h> |
14 | #include <linux/static_key.h> | ||
14 | 15 | ||
15 | #include <asm/hpet.h> | 16 | #include <asm/hpet.h> |
16 | #include <asm/timer.h> | 17 | #include <asm/timer.h> |
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable; | |||
37 | erroneous rdtsc usage on !cpu_has_tsc processors */ | 38 | erroneous rdtsc usage on !cpu_has_tsc processors */ |
38 | static int __read_mostly tsc_disabled = -1; | 39 | static int __read_mostly tsc_disabled = -1; |
39 | 40 | ||
41 | static struct static_key __use_tsc = STATIC_KEY_INIT; | ||
42 | |||
40 | int tsc_clocksource_reliable; | 43 | int tsc_clocksource_reliable; |
44 | |||
45 | /* | ||
46 | * Use a ring-buffer like data structure, where a writer advances the head by | ||
47 | * writing a new data entry and a reader advances the tail when it observes a | ||
48 | * new entry. | ||
49 | * | ||
50 | * Writers are made to wait on readers until there's space to write a new | ||
51 | * entry. | ||
52 | * | ||
53 | * This means that we can always use an {offset, mul} pair to compute a ns | ||
54 | * value that is 'roughly' in the right direction, even if we're writing a new | ||
55 | * {offset, mul} pair during the clock read. | ||
56 | * | ||
57 | * The down-side is that we can no longer guarantee strict monotonicity anymore | ||
58 | * (assuming the TSC was that to begin with), because while we compute the | ||
59 | * intersection point of the two clock slopes and make sure the time is | ||
60 | * continuous at the point of switching; we can no longer guarantee a reader is | ||
61 | * strictly before or after the switch point. | ||
62 | * | ||
63 | * It does mean a reader no longer needs to disable IRQs in order to avoid | ||
64 | * CPU-Freq updates messing with his times, and similarly an NMI reader will | ||
65 | * no longer run the risk of hitting half-written state. | ||
66 | */ | ||
67 | |||
68 | struct cyc2ns { | ||
69 | struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ | ||
70 | struct cyc2ns_data *head; /* 48 + 8 = 56 */ | ||
71 | struct cyc2ns_data *tail; /* 56 + 8 = 64 */ | ||
72 | }; /* exactly fits one cacheline */ | ||
73 | |||
74 | static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); | ||
75 | |||
76 | struct cyc2ns_data *cyc2ns_read_begin(void) | ||
77 | { | ||
78 | struct cyc2ns_data *head; | ||
79 | |||
80 | preempt_disable(); | ||
81 | |||
82 | head = this_cpu_read(cyc2ns.head); | ||
83 | /* | ||
84 | * Ensure we observe the entry when we observe the pointer to it. | ||
85 | * matches the wmb from cyc2ns_write_end(). | ||
86 | */ | ||
87 | smp_read_barrier_depends(); | ||
88 | head->__count++; | ||
89 | barrier(); | ||
90 | |||
91 | return head; | ||
92 | } | ||
93 | |||
94 | void cyc2ns_read_end(struct cyc2ns_data *head) | ||
95 | { | ||
96 | barrier(); | ||
97 | /* | ||
98 | * If we're the outer most nested read; update the tail pointer | ||
99 | * when we're done. This notifies possible pending writers | ||
100 | * that we've observed the head pointer and that the other | ||
101 | * entry is now free. | ||
102 | */ | ||
103 | if (!--head->__count) { | ||
104 | /* | ||
105 | * x86-TSO does not reorder writes with older reads; | ||
106 | * therefore once this write becomes visible to another | ||
107 | * cpu, we must be finished reading the cyc2ns_data. | ||
108 | * | ||
109 | * matches with cyc2ns_write_begin(). | ||
110 | */ | ||
111 | this_cpu_write(cyc2ns.tail, head); | ||
112 | } | ||
113 | preempt_enable(); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * Begin writing a new @data entry for @cpu. | ||
118 | * | ||
119 | * Assumes some sort of write side lock; currently 'provided' by the assumption | ||
120 | * that cpufreq will call its notifiers sequentially. | ||
121 | */ | ||
122 | static struct cyc2ns_data *cyc2ns_write_begin(int cpu) | ||
123 | { | ||
124 | struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); | ||
125 | struct cyc2ns_data *data = c2n->data; | ||
126 | |||
127 | if (data == c2n->head) | ||
128 | data++; | ||
129 | |||
130 | /* XXX send an IPI to @cpu in order to guarantee a read? */ | ||
131 | |||
132 | /* | ||
133 | * When we observe the tail write from cyc2ns_read_end(), | ||
134 | * the cpu must be done with that entry and its safe | ||
135 | * to start writing to it. | ||
136 | */ | ||
137 | while (c2n->tail == data) | ||
138 | cpu_relax(); | ||
139 | |||
140 | return data; | ||
141 | } | ||
142 | |||
143 | static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) | ||
144 | { | ||
145 | struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); | ||
146 | |||
147 | /* | ||
148 | * Ensure the @data writes are visible before we publish the | ||
149 | * entry. Matches the data-depencency in cyc2ns_read_begin(). | ||
150 | */ | ||
151 | smp_wmb(); | ||
152 | |||
153 | ACCESS_ONCE(c2n->head) = data; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Accelerators for sched_clock() | ||
158 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
159 | * basic equation: | ||
160 | * ns = cycles / (freq / ns_per_sec) | ||
161 | * ns = cycles * (ns_per_sec / freq) | ||
162 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
163 | * ns = cycles * (10^6 / cpu_khz) | ||
164 | * | ||
165 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
166 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
167 | * ns = cycles * cyc2ns_scale / SC | ||
168 | * | ||
169 | * And since SC is a constant power of two, we can convert the div | ||
170 | * into a shift. | ||
171 | * | ||
172 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
173 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
174 | * (mathieu.desnoyers@polymtl.ca) | ||
175 | * | ||
176 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
177 | */ | ||
178 | |||
179 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | ||
180 | |||
181 | static void cyc2ns_data_init(struct cyc2ns_data *data) | ||
182 | { | ||
183 | data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; | ||
184 | data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; | ||
185 | data->cyc2ns_offset = 0; | ||
186 | data->__count = 0; | ||
187 | } | ||
188 | |||
189 | static void cyc2ns_init(int cpu) | ||
190 | { | ||
191 | struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); | ||
192 | |||
193 | cyc2ns_data_init(&c2n->data[0]); | ||
194 | cyc2ns_data_init(&c2n->data[1]); | ||
195 | |||
196 | c2n->head = c2n->data; | ||
197 | c2n->tail = c2n->data; | ||
198 | } | ||
199 | |||
200 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | ||
201 | { | ||
202 | struct cyc2ns_data *data, *tail; | ||
203 | unsigned long long ns; | ||
204 | |||
205 | /* | ||
206 | * See cyc2ns_read_*() for details; replicated in order to avoid | ||
207 | * an extra few instructions that came with the abstraction. | ||
208 | * Notable, it allows us to only do the __count and tail update | ||
209 | * dance when its actually needed. | ||
210 | */ | ||
211 | |||
212 | preempt_disable(); | ||
213 | data = this_cpu_read(cyc2ns.head); | ||
214 | tail = this_cpu_read(cyc2ns.tail); | ||
215 | |||
216 | if (likely(data == tail)) { | ||
217 | ns = data->cyc2ns_offset; | ||
218 | ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); | ||
219 | } else { | ||
220 | data->__count++; | ||
221 | |||
222 | barrier(); | ||
223 | |||
224 | ns = data->cyc2ns_offset; | ||
225 | ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); | ||
226 | |||
227 | barrier(); | ||
228 | |||
229 | if (!--data->__count) | ||
230 | this_cpu_write(cyc2ns.tail, data); | ||
231 | } | ||
232 | preempt_enable(); | ||
233 | |||
234 | return ns; | ||
235 | } | ||
236 | |||
237 | /* XXX surely we already have this someplace in the kernel?! */ | ||
238 | #define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) | ||
239 | |||
240 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
241 | { | ||
242 | unsigned long long tsc_now, ns_now; | ||
243 | struct cyc2ns_data *data; | ||
244 | unsigned long flags; | ||
245 | |||
246 | local_irq_save(flags); | ||
247 | sched_clock_idle_sleep_event(); | ||
248 | |||
249 | if (!cpu_khz) | ||
250 | goto done; | ||
251 | |||
252 | data = cyc2ns_write_begin(cpu); | ||
253 | |||
254 | rdtscll(tsc_now); | ||
255 | ns_now = cycles_2_ns(tsc_now); | ||
256 | |||
257 | /* | ||
258 | * Compute a new multiplier as per the above comment and ensure our | ||
259 | * time function is continuous; see the comment near struct | ||
260 | * cyc2ns_data. | ||
261 | */ | ||
262 | data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); | ||
263 | data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; | ||
264 | data->cyc2ns_offset = ns_now - | ||
265 | mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); | ||
266 | |||
267 | cyc2ns_write_end(cpu, data); | ||
268 | |||
269 | done: | ||
270 | sched_clock_idle_wakeup_event(0); | ||
271 | local_irq_restore(flags); | ||
272 | } | ||
41 | /* | 273 | /* |
42 | * Scheduler clock - returns current time in nanosec units. | 274 | * Scheduler clock - returns current time in nanosec units. |
43 | */ | 275 | */ |
44 | u64 native_sched_clock(void) | 276 | u64 native_sched_clock(void) |
45 | { | 277 | { |
46 | u64 this_offset; | 278 | u64 tsc_now; |
47 | 279 | ||
48 | /* | 280 | /* |
49 | * Fall back to jiffies if there's no TSC available: | 281 | * Fall back to jiffies if there's no TSC available: |
@@ -53,16 +285,16 @@ u64 native_sched_clock(void) | |||
53 | * very important for it to be as fast as the platform | 285 | * very important for it to be as fast as the platform |
54 | * can achieve it. ) | 286 | * can achieve it. ) |
55 | */ | 287 | */ |
56 | if (unlikely(tsc_disabled)) { | 288 | if (!static_key_false(&__use_tsc)) { |
57 | /* No locking but a rare wrong value is not a big deal: */ | 289 | /* No locking but a rare wrong value is not a big deal: */ |
58 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); | 290 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); |
59 | } | 291 | } |
60 | 292 | ||
61 | /* read the Time Stamp Counter: */ | 293 | /* read the Time Stamp Counter: */ |
62 | rdtscll(this_offset); | 294 | rdtscll(tsc_now); |
63 | 295 | ||
64 | /* return the value in ns */ | 296 | /* return the value in ns */ |
65 | return __cycles_2_ns(this_offset); | 297 | return cycles_2_ns(tsc_now); |
66 | } | 298 | } |
67 | 299 | ||
68 | /* We need to define a real function for sched_clock, to override the | 300 | /* We need to define a real function for sched_clock, to override the |
@@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void) | |||
589 | EXPORT_SYMBOL(recalibrate_cpu_khz); | 821 | EXPORT_SYMBOL(recalibrate_cpu_khz); |
590 | 822 | ||
591 | 823 | ||
592 | /* Accelerators for sched_clock() | ||
593 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
594 | * basic equation: | ||
595 | * ns = cycles / (freq / ns_per_sec) | ||
596 | * ns = cycles * (ns_per_sec / freq) | ||
597 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
598 | * ns = cycles * (10^6 / cpu_khz) | ||
599 | * | ||
600 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
601 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
602 | * ns = cycles * cyc2ns_scale / SC | ||
603 | * | ||
604 | * And since SC is a constant power of two, we can convert the div | ||
605 | * into a shift. | ||
606 | * | ||
607 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
608 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
609 | * (mathieu.desnoyers@polymtl.ca) | ||
610 | * | ||
611 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
612 | */ | ||
613 | |||
614 | DEFINE_PER_CPU(unsigned long, cyc2ns); | ||
615 | DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); | ||
616 | |||
617 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
618 | { | ||
619 | unsigned long long tsc_now, ns_now, *offset; | ||
620 | unsigned long flags, *scale; | ||
621 | |||
622 | local_irq_save(flags); | ||
623 | sched_clock_idle_sleep_event(); | ||
624 | |||
625 | scale = &per_cpu(cyc2ns, cpu); | ||
626 | offset = &per_cpu(cyc2ns_offset, cpu); | ||
627 | |||
628 | rdtscll(tsc_now); | ||
629 | ns_now = __cycles_2_ns(tsc_now); | ||
630 | |||
631 | if (cpu_khz) { | ||
632 | *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + | ||
633 | cpu_khz / 2) / cpu_khz; | ||
634 | *offset = ns_now - mult_frac(tsc_now, *scale, | ||
635 | (1UL << CYC2NS_SCALE_FACTOR)); | ||
636 | } | ||
637 | |||
638 | sched_clock_idle_wakeup_event(0); | ||
639 | local_irq_restore(flags); | ||
640 | } | ||
641 | |||
642 | static unsigned long long cyc2ns_suspend; | 824 | static unsigned long long cyc2ns_suspend; |
643 | 825 | ||
644 | void tsc_save_sched_clock_state(void) | 826 | void tsc_save_sched_clock_state(void) |
645 | { | 827 | { |
646 | if (!sched_clock_stable) | 828 | if (!sched_clock_stable()) |
647 | return; | 829 | return; |
648 | 830 | ||
649 | cyc2ns_suspend = sched_clock(); | 831 | cyc2ns_suspend = sched_clock(); |
@@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void) | |||
663 | unsigned long flags; | 845 | unsigned long flags; |
664 | int cpu; | 846 | int cpu; |
665 | 847 | ||
666 | if (!sched_clock_stable) | 848 | if (!sched_clock_stable()) |
667 | return; | 849 | return; |
668 | 850 | ||
669 | local_irq_save(flags); | 851 | local_irq_save(flags); |
670 | 852 | ||
671 | __this_cpu_write(cyc2ns_offset, 0); | 853 | /* |
854 | * We're comming out of suspend, there's no concurrency yet; don't | ||
855 | * bother being nice about the RCU stuff, just write to both | ||
856 | * data fields. | ||
857 | */ | ||
858 | |||
859 | this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); | ||
860 | this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); | ||
861 | |||
672 | offset = cyc2ns_suspend - sched_clock(); | 862 | offset = cyc2ns_suspend - sched_clock(); |
673 | 863 | ||
674 | for_each_possible_cpu(cpu) | 864 | for_each_possible_cpu(cpu) { |
675 | per_cpu(cyc2ns_offset, cpu) = offset; | 865 | per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; |
866 | per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; | ||
867 | } | ||
676 | 868 | ||
677 | local_irq_restore(flags); | 869 | local_irq_restore(flags); |
678 | } | 870 | } |
@@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason) | |||
795 | { | 987 | { |
796 | if (!tsc_unstable) { | 988 | if (!tsc_unstable) { |
797 | tsc_unstable = 1; | 989 | tsc_unstable = 1; |
798 | sched_clock_stable = 0; | 990 | clear_sched_clock_stable(); |
799 | disable_sched_clock_irqtime(); | 991 | disable_sched_clock_irqtime(); |
800 | pr_info("Marking TSC unstable due to %s\n", reason); | 992 | pr_info("Marking TSC unstable due to %s\n", reason); |
801 | /* Change only the rating, when not registered */ | 993 | /* Change only the rating, when not registered */ |
@@ -995,14 +1187,18 @@ void __init tsc_init(void) | |||
995 | * speed as the bootup CPU. (cpufreq notifiers will fix this | 1187 | * speed as the bootup CPU. (cpufreq notifiers will fix this |
996 | * up if their speed diverges) | 1188 | * up if their speed diverges) |
997 | */ | 1189 | */ |
998 | for_each_possible_cpu(cpu) | 1190 | for_each_possible_cpu(cpu) { |
1191 | cyc2ns_init(cpu); | ||
999 | set_cyc2ns_scale(cpu_khz, cpu); | 1192 | set_cyc2ns_scale(cpu_khz, cpu); |
1193 | } | ||
1000 | 1194 | ||
1001 | if (tsc_disabled > 0) | 1195 | if (tsc_disabled > 0) |
1002 | return; | 1196 | return; |
1003 | 1197 | ||
1004 | /* now allow native_sched_clock() to use rdtsc */ | 1198 | /* now allow native_sched_clock() to use rdtsc */ |
1199 | |||
1005 | tsc_disabled = 0; | 1200 | tsc_disabled = 0; |
1201 | static_key_slow_inc(&__use_tsc); | ||
1006 | 1202 | ||
1007 | if (!no_sched_irq_time) | 1203 | if (!no_sched_irq_time) |
1008 | enable_sched_clock_irqtime(); | 1204 | enable_sched_clock_irqtime(); |
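
As a worked instance of the cyc2ns scaling math restored in the hunk above (hypothetical numbers: a 2 GHz TSC, i.e. cpu_khz = 2,000,000, with CYC2NS_SCALE_FACTOR = 10):

\[
\texttt{cyc2ns\_mul}
= \operatorname{round}\!\left(\frac{10^{6}\cdot 2^{10}}{2\,000\,000}\right)
= 512,
\qquad
ns = \frac{cyc \cdot 512}{2^{10}} = \frac{cyc}{2},
\]

i.e. 0.5 ns per cycle, as expected for a 2 GHz clock. Since the multiplier is bounded by \(10^{6}\cdot 2^{10}\), it always fits the u32 cyc2ns_mul field, as the restored comment notes.
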
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index efe4d7220397..dfe605ac1bcd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) | |||
433 | return; | 433 | return; |
434 | } | 434 | } |
435 | 435 | ||
436 | static inline unsigned long cycles_2_us(unsigned long long cyc) | 436 | /* |
437 | * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative | ||
438 | * number, not an absolute. It converts a duration in cycles to a duration in | ||
439 | * ns. | ||
440 | */ | ||
441 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | ||
437 | { | 442 | { |
443 | struct cyc2ns_data *data = cyc2ns_read_begin(); | ||
438 | unsigned long long ns; | 444 | unsigned long long ns; |
439 | unsigned long us; | ||
440 | int cpu = smp_processor_id(); | ||
441 | 445 | ||
442 | ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; | 446 | ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); |
443 | us = ns / 1000; | 447 | |
444 | return us; | 448 | cyc2ns_read_end(data); |
449 | return ns; | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * The reverse of the above; converts a duration in ns to a duration in cycles. | ||
454 | */ | ||
455 | static inline unsigned long long ns_2_cycles(unsigned long long ns) | ||
456 | { | ||
457 | struct cyc2ns_data *data = cyc2ns_read_begin(); | ||
458 | unsigned long long cyc; | ||
459 | |||
460 | cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; | ||
461 | |||
462 | cyc2ns_read_end(data); | ||
463 | return cyc; | ||
464 | } | ||
465 | |||
466 | static inline unsigned long cycles_2_us(unsigned long long cyc) | ||
467 | { | ||
468 | return cycles_2_ns(cyc) / NSEC_PER_USEC; | ||
469 | } | ||
470 | |||
471 | static inline cycles_t sec_2_cycles(unsigned long sec) | ||
472 | { | ||
473 | return ns_2_cycles(sec * NSEC_PER_SEC); | ||
474 | } | ||
475 | |||
476 | static inline unsigned long long usec_2_cycles(unsigned long usec) | ||
477 | { | ||
478 | return ns_2_cycles(usec * NSEC_PER_USEC); | ||
445 | } | 479 | } |
446 | 480 | ||
447 | /* | 481 | /* |
@@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, | |||
668 | bcp, try); | 702 | bcp, try); |
669 | } | 703 | } |
670 | 704 | ||
671 | static inline cycles_t sec_2_cycles(unsigned long sec) | ||
672 | { | ||
673 | unsigned long ns; | ||
674 | cycles_t cyc; | ||
675 | |||
676 | ns = sec * 1000000000; | ||
677 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); | ||
678 | return cyc; | ||
679 | } | ||
680 | |||
681 | /* | 705 | /* |
682 | * Our retries are blocked by all destination sw ack resources being | 706 | * Our retries are blocked by all destination sw ack resources being |
683 | * in use, and a timeout is pending. In that case hardware immediately | 707 | * in use, and a timeout is pending. In that case hardware immediately |
@@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) | |||
1327 | { | 1351 | { |
1328 | } | 1352 | } |
1329 | 1353 | ||
1330 | static inline unsigned long long usec_2_cycles(unsigned long microsec) | ||
1331 | { | ||
1332 | unsigned long ns; | ||
1333 | unsigned long long cyc; | ||
1334 | |||
1335 | ns = microsec * 1000; | ||
1336 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); | ||
1337 | return cyc; | ||
1338 | } | ||
1339 | |||
1340 | /* | 1354 | /* |
1341 | * Display the statistics thru /proc/sgi_uv/ptc_statistics | 1355 | * Display the statistics thru /proc/sgi_uv/ptc_statistics |
1342 | * 'data' points to the cpu number | 1356 | * 'data' points to the cpu number |
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb8380a1c..96bc506ac6de 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@ | |||
357 | 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev | 357 | 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev |
358 | 349 i386 kcmp sys_kcmp | 358 | 349 i386 kcmp sys_kcmp |
359 | 350 i386 finit_module sys_finit_module | 359 | 350 i386 finit_module sys_finit_module |
360 | 351 i386 sched_setattr sys_sched_setattr | ||
361 | 352 i386 sched_getattr sys_sched_getattr | ||
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..a12bddc7ccea 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@ | |||
320 | 311 64 process_vm_writev sys_process_vm_writev | 320 | 311 64 process_vm_writev sys_process_vm_writev |
321 | 312 common kcmp sys_kcmp | 321 | 312 common kcmp sys_kcmp |
322 | 313 common finit_module sys_finit_module | 322 | 313 common finit_module sys_finit_module |
323 | 314 common sched_setattr sys_sched_setattr | ||
324 | 315 common sched_getattr sys_sched_getattr | ||
323 | 325 | ||
324 | # | 326 | # |
325 | # x32-specific system call numbers start at 512 to avoid cache impact | 327 | # x32-specific system call numbers start at 512 to avoid cache impact |
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index fc6008fbce35..509452a62f96 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -193,10 +193,7 @@ static int power_saving_thread(void *data) | |||
193 | CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 193 | CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); |
194 | stop_critical_timings(); | 194 | stop_critical_timings(); |
195 | 195 | ||
196 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | 196 | mwait_idle_with_hints(power_saving_mwait_eax, 1); |
197 | smp_mb(); | ||
198 | if (!need_resched()) | ||
199 | __mwait(power_saving_mwait_eax, 1); | ||
200 | 197 | ||
201 | start_critical_timings(); | 198 | start_critical_timings(); |
202 | if (lapic_marked_unstable) | 199 | if (lapic_marked_unstable) |
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 644516d9bde6..f90c56c8379e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -727,11 +727,6 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, | |||
727 | if (unlikely(!pr)) | 727 | if (unlikely(!pr)) |
728 | return -EINVAL; | 728 | return -EINVAL; |
729 | 729 | ||
730 | if (cx->entry_method == ACPI_CSTATE_FFH) { | ||
731 | if (current_set_polling_and_test()) | ||
732 | return -EINVAL; | ||
733 | } | ||
734 | |||
735 | lapic_timer_state_broadcast(pr, cx, 1); | 730 | lapic_timer_state_broadcast(pr, cx, 1); |
736 | acpi_idle_do_entry(cx); | 731 | acpi_idle_do_entry(cx); |
737 | 732 | ||
@@ -785,11 +780,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, | |||
785 | if (unlikely(!pr)) | 780 | if (unlikely(!pr)) |
786 | return -EINVAL; | 781 | return -EINVAL; |
787 | 782 | ||
788 | if (cx->entry_method == ACPI_CSTATE_FFH) { | ||
789 | if (current_set_polling_and_test()) | ||
790 | return -EINVAL; | ||
791 | } | ||
792 | |||
793 | /* | 783 | /* |
794 | * Must be done before busmaster disable as we might need to | 784 | * Must be done before busmaster disable as we might need to |
795 | * access HPET ! | 785 | * access HPET ! |
@@ -841,11 +831,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, | |||
841 | } | 831 | } |
842 | } | 832 | } |
843 | 833 | ||
844 | if (cx->entry_method == ACPI_CSTATE_FFH) { | ||
845 | if (current_set_polling_and_test()) | ||
846 | return -EINVAL; | ||
847 | } | ||
848 | |||
849 | acpi_unlazy_tlb(smp_processor_id()); | 834 | acpi_unlazy_tlb(smp_processor_id()); |
850 | 835 | ||
851 | /* Tell the scheduler that we are going deep-idle: */ | 836 | /* Tell the scheduler that we are going deep-idle: */ |
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 797ed29a36ea..6c0e0452dd9b 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -377,16 +377,7 @@ static int intel_idle(struct cpuidle_device *dev, | |||
377 | if (!(lapic_timer_reliable_states & (1 << (cstate)))) | 377 | if (!(lapic_timer_reliable_states & (1 << (cstate)))) |
378 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 378 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); |
379 | 379 | ||
380 | if (!current_set_polling_and_test()) { | 380 | mwait_idle_with_hints(eax, ecx); |
381 | |||
382 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) | ||
383 | clflush((void *)¤t_thread_info()->flags); | ||
384 | |||
385 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
386 | smp_mb(); | ||
387 | if (!need_resched()) | ||
388 | __mwait(eax, ecx); | ||
389 | } | ||
390 | 381 | ||
391 | if (!(lapic_timer_reliable_states & (1 << (cstate)))) | 382 | if (!(lapic_timer_reliable_states & (1 << (cstate)))) |
392 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); | 383 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); |
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 8f181b3f842b..d833c8f5b465 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -438,14 +438,12 @@ static int clamp_thread(void *arg) | |||
438 | */ | 438 | */ |
439 | local_touch_nmi(); | 439 | local_touch_nmi(); |
440 | stop_critical_timings(); | 440 | stop_critical_timings(); |
441 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | 441 | mwait_idle_with_hints(eax, ecx); |
442 | cpu_relax(); /* allow HT sibling to run */ | ||
443 | __mwait(eax, ecx); | ||
444 | start_critical_timings(); | 442 | start_critical_timings(); |
445 | atomic_inc(&idle_wakeup_counter); | 443 | atomic_inc(&idle_wakeup_counter); |
446 | } | 444 | } |
447 | tick_nohz_idle_exit(); | 445 | tick_nohz_idle_exit(); |
448 | preempt_enable_no_resched(); | 446 | preempt_enable(); |
449 | } | 447 | } |
450 | del_timer_sync(&wakeup_timer); | 448 | del_timer_sync(&wakeup_timer); |
451 | clear_bit(cpunr, cpu_clamping_mask); | 449 | clear_bit(cpunr, cpu_clamping_mask); |
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 27b1bcffe408..86c12c93e3cf 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -1,9 +1,35 @@ | |||
1 | #ifndef _LINUX_BH_H | 1 | #ifndef _LINUX_BH_H |
2 | #define _LINUX_BH_H | 2 | #define _LINUX_BH_H |
3 | 3 | ||
4 | extern void local_bh_disable(void); | 4 | #include <linux/preempt.h> |
5 | #include <linux/preempt_mask.h> | ||
6 | |||
7 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
8 | extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); | ||
9 | #else | ||
10 | static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) | ||
11 | { | ||
12 | preempt_count_add(cnt); | ||
13 | barrier(); | ||
14 | } | ||
15 | #endif | ||
16 | |||
17 | static inline void local_bh_disable(void) | ||
18 | { | ||
19 | __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); | ||
20 | } | ||
21 | |||
5 | extern void _local_bh_enable(void); | 22 | extern void _local_bh_enable(void); |
6 | extern void local_bh_enable(void); | 23 | extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); |
7 | extern void local_bh_enable_ip(unsigned long ip); | 24 | |
25 | static inline void local_bh_enable_ip(unsigned long ip) | ||
26 | { | ||
27 | __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); | ||
28 | } | ||
29 | |||
30 | static inline void local_bh_enable(void) | ||
31 | { | ||
32 | __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); | ||
33 | } | ||
8 | 34 | ||
9 | #endif /* _LINUX_BH_H */ | 35 | #endif /* _LINUX_BH_H */ |
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d9cf963ac832..12d5f972f23f 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/lockdep.h> | 5 | #include <linux/lockdep.h> |
6 | #include <linux/ftrace_irq.h> | 6 | #include <linux/ftrace_irq.h> |
7 | #include <linux/vtime.h> | 7 | #include <linux/vtime.h> |
8 | #include <asm/hardirq.h> | ||
8 | 9 | ||
9 | 10 | ||
10 | extern void synchronize_irq(unsigned int irq); | 11 | extern void synchronize_irq(unsigned int irq); |
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b0ed422e4e4a..f0e52383a001 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/securebits.h> | 12 | #include <linux/securebits.h> |
13 | #include <linux/seqlock.h> | 13 | #include <linux/seqlock.h> |
14 | #include <linux/rbtree.h> | ||
14 | #include <net/net_namespace.h> | 15 | #include <net/net_namespace.h> |
15 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> |
16 | 17 | ||
@@ -154,6 +155,14 @@ extern struct task_group root_task_group; | |||
154 | 155 | ||
155 | #define INIT_TASK_COMM "swapper" | 156 | #define INIT_TASK_COMM "swapper" |
156 | 157 | ||
158 | #ifdef CONFIG_RT_MUTEXES | ||
159 | # define INIT_RT_MUTEXES(tsk) \ | ||
160 | .pi_waiters = RB_ROOT, \ | ||
161 | .pi_waiters_leftmost = NULL, | ||
162 | #else | ||
163 | # define INIT_RT_MUTEXES(tsk) | ||
164 | #endif | ||
165 | |||
157 | /* | 166 | /* |
158 | * INIT_TASK is used to set up the first task table, touch at | 167 | * INIT_TASK is used to set up the first task table, touch at |
159 | * your own risk!. Base=0, limit=0x1fffff (=2MB) | 168 | * your own risk!. Base=0, limit=0x1fffff (=2MB) |
@@ -221,6 +230,7 @@ extern struct task_group root_task_group; | |||
221 | INIT_TRACE_RECURSION \ | 230 | INIT_TRACE_RECURSION \ |
222 | INIT_TASK_RCU_PREEMPT(tsk) \ | 231 | INIT_TASK_RCU_PREEMPT(tsk) \ |
223 | INIT_CPUSET_SEQ(tsk) \ | 232 | INIT_CPUSET_SEQ(tsk) \ |
233 | INIT_RT_MUTEXES(tsk) \ | ||
224 | INIT_VTIME(tsk) \ | 234 | INIT_VTIME(tsk) \ |
225 | } | 235 | } |
226 | 236 | ||
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index a3d9dc8c2c00..59749fc48328 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -64,7 +64,11 @@ do { \ | |||
64 | } while (0) | 64 | } while (0) |
65 | 65 | ||
66 | #else | 66 | #else |
67 | #define preempt_enable() preempt_enable_no_resched() | 67 | #define preempt_enable() \ |
68 | do { \ | ||
69 | barrier(); \ | ||
70 | preempt_count_dec(); \ | ||
71 | } while (0) | ||
68 | #define preempt_check_resched() do { } while (0) | 72 | #define preempt_check_resched() do { } while (0) |
69 | #endif | 73 | #endif |
70 | 74 | ||
@@ -93,7 +97,11 @@ do { \ | |||
93 | __preempt_schedule_context(); \ | 97 | __preempt_schedule_context(); \ |
94 | } while (0) | 98 | } while (0) |
95 | #else | 99 | #else |
96 | #define preempt_enable_notrace() preempt_enable_no_resched_notrace() | 100 | #define preempt_enable_notrace() \ |
101 | do { \ | ||
102 | barrier(); \ | ||
103 | __preempt_count_dec(); \ | ||
104 | } while (0) | ||
97 | #endif | 105 | #endif |
98 | 106 | ||
99 | #else /* !CONFIG_PREEMPT_COUNT */ | 107 | #else /* !CONFIG_PREEMPT_COUNT */ |
@@ -116,6 +124,31 @@ do { \ | |||
116 | 124 | ||
117 | #endif /* CONFIG_PREEMPT_COUNT */ | 125 | #endif /* CONFIG_PREEMPT_COUNT */ |
118 | 126 | ||
127 | #ifdef MODULE | ||
128 | /* | ||
129 | * Modules have no business playing preemption tricks. | ||
130 | */ | ||
131 | #undef sched_preempt_enable_no_resched | ||
132 | #undef preempt_enable_no_resched | ||
133 | #undef preempt_enable_no_resched_notrace | ||
134 | #undef preempt_check_resched | ||
135 | #endif | ||
136 | |||
137 | #ifdef CONFIG_PREEMPT | ||
138 | #define preempt_set_need_resched() \ | ||
139 | do { \ | ||
140 | set_preempt_need_resched(); \ | ||
141 | } while (0) | ||
142 | #define preempt_fold_need_resched() \ | ||
143 | do { \ | ||
144 | if (tif_need_resched()) \ | ||
145 | set_preempt_need_resched(); \ | ||
146 | } while (0) | ||
147 | #else | ||
148 | #define preempt_set_need_resched() do { } while (0) | ||
149 | #define preempt_fold_need_resched() do { } while (0) | ||
150 | #endif | ||
151 | |||
119 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 152 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
120 | 153 | ||
121 | struct preempt_notifier; | 154 | struct preempt_notifier; |
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
index d169820203dd..dbeec4d4a3be 100644
--- a/include/linux/preempt_mask.h
+++ b/include/linux/preempt_mask.h
@@ -2,7 +2,6 @@ | |||
2 | #define LINUX_PREEMPT_MASK_H | 2 | #define LINUX_PREEMPT_MASK_H |
3 | 3 | ||
4 | #include <linux/preempt.h> | 4 | #include <linux/preempt.h> |
5 | #include <asm/hardirq.h> | ||
6 | 5 | ||
7 | /* | 6 | /* |
8 | * We put the hardirq and softirq counter into the preemption | 7 | * We put the hardirq and softirq counter into the preemption |
@@ -79,6 +78,21 @@ | |||
79 | #endif | 78 | #endif |
80 | 79 | ||
81 | /* | 80 | /* |
81 | * The preempt_count offset needed for things like: | ||
82 | * | ||
83 | * spin_lock_bh() | ||
84 | * | ||
85 | * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and | ||
86 | * softirqs, such that unlock sequences of: | ||
87 | * | ||
88 | * spin_unlock(); | ||
89 | * local_bh_enable(); | ||
90 | * | ||
91 | * Work as expected. | ||
92 | */ | ||
93 | #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) | ||
94 | |||
95 | /* | ||
82 | * Are we running in atomic context? WARNING: this macro cannot | 96 | * Are we running in atomic context? WARNING: this macro cannot |
83 | * always detect atomic context; in particular, it cannot know about | 97 | * always detect atomic context; in particular, it cannot know about |
84 | * held spinlocks in non-preemptible kernels. Thus it should not be | 98 | * held spinlocks in non-preemptible kernels. Thus it should not be |
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index de17134244f3..3aed8d737e1a 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -13,7 +13,7 @@ | |||
13 | #define __LINUX_RT_MUTEX_H | 13 | #define __LINUX_RT_MUTEX_H |
14 | 14 | ||
15 | #include <linux/linkage.h> | 15 | #include <linux/linkage.h> |
16 | #include <linux/plist.h> | 16 | #include <linux/rbtree.h> |
17 | #include <linux/spinlock_types.h> | 17 | #include <linux/spinlock_types.h> |
18 | 18 | ||
19 | extern int max_lock_depth; /* for sysctl */ | 19 | extern int max_lock_depth; /* for sysctl */ |
@@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */ | |||
22 | * The rt_mutex structure | 22 | * The rt_mutex structure |
23 | * | 23 | * |
24 | * @wait_lock: spinlock to protect the structure | 24 | * @wait_lock: spinlock to protect the structure |
25 | * @wait_list: pilist head to enqueue waiters in priority order | 25 | * @waiters: rbtree root to enqueue waiters in priority order |
26 | * @waiters_leftmost: top waiter | ||
26 | * @owner: the mutex owner | 27 | * @owner: the mutex owner |
27 | */ | 28 | */ |
28 | struct rt_mutex { | 29 | struct rt_mutex { |
29 | raw_spinlock_t wait_lock; | 30 | raw_spinlock_t wait_lock; |
30 | struct plist_head wait_list; | 31 | struct rb_root waiters; |
32 | struct rb_node *waiters_leftmost; | ||
31 | struct task_struct *owner; | 33 | struct task_struct *owner; |
32 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 34 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
33 | int save_state; | 35 | int save_state; |
@@ -66,7 +68,7 @@ struct hrtimer_sleeper; | |||
66 | 68 | ||
67 | #define __RT_MUTEX_INITIALIZER(mutexname) \ | 69 | #define __RT_MUTEX_INITIALIZER(mutexname) \ |
68 | { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ | 70 | { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ |
69 | , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ | 71 | , .waiters = RB_ROOT \ |
70 | , .owner = NULL \ | 72 | , .owner = NULL \ |
71 | __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} | 73 | __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} |
72 | 74 | ||
@@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock); | |||
98 | 100 | ||
99 | extern void rt_mutex_unlock(struct rt_mutex *lock); | 101 | extern void rt_mutex_unlock(struct rt_mutex *lock); |
100 | 102 | ||
101 | #ifdef CONFIG_RT_MUTEXES | ||
102 | # define INIT_RT_MUTEXES(tsk) \ | ||
103 | .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ | ||
104 | INIT_RT_MUTEX_DEBUG(tsk) | ||
105 | #else | ||
106 | # define INIT_RT_MUTEXES(tsk) | ||
107 | #endif | ||
108 | |||
109 | #endif | 103 | #endif |
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index 9c9f0495d37c..5b9b84b20407 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -172,8 +172,7 @@ static inline void __raw_read_lock_irq(rwlock_t *lock) | |||
172 | 172 | ||
173 | static inline void __raw_read_lock_bh(rwlock_t *lock) | 173 | static inline void __raw_read_lock_bh(rwlock_t *lock) |
174 | { | 174 | { |
175 | local_bh_disable(); | 175 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
176 | preempt_disable(); | ||
177 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 176 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
178 | LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); | 177 | LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); |
179 | } | 178 | } |
@@ -200,8 +199,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock) | |||
200 | 199 | ||
201 | static inline void __raw_write_lock_bh(rwlock_t *lock) | 200 | static inline void __raw_write_lock_bh(rwlock_t *lock) |
202 | { | 201 | { |
203 | local_bh_disable(); | 202 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
204 | preempt_disable(); | ||
205 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 203 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
206 | LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); | 204 | LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); |
207 | } | 205 | } |
@@ -250,8 +248,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock) | |||
250 | { | 248 | { |
251 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 249 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
252 | do_raw_read_unlock(lock); | 250 | do_raw_read_unlock(lock); |
253 | preempt_enable_no_resched(); | 251 | __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
254 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | ||
255 | } | 252 | } |
256 | 253 | ||
257 | static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, | 254 | static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, |
@@ -275,8 +272,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock) | |||
275 | { | 272 | { |
276 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 273 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
277 | do_raw_write_unlock(lock); | 274 | do_raw_write_unlock(lock); |
278 | preempt_enable_no_resched(); | 275 | __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
279 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | ||
280 | } | 276 | } |
281 | 277 | ||
282 | #endif /* __LINUX_RWLOCK_API_SMP_H */ | 278 | #endif /* __LINUX_RWLOCK_API_SMP_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 53f97eb8dbc7..ffccdad050b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -16,6 +16,7 @@ struct sched_param { | |||
16 | #include <linux/types.h> | 16 | #include <linux/types.h> |
17 | #include <linux/timex.h> | 17 | #include <linux/timex.h> |
18 | #include <linux/jiffies.h> | 18 | #include <linux/jiffies.h> |
19 | #include <linux/plist.h> | ||
19 | #include <linux/rbtree.h> | 20 | #include <linux/rbtree.h> |
20 | #include <linux/thread_info.h> | 21 | #include <linux/thread_info.h> |
21 | #include <linux/cpumask.h> | 22 | #include <linux/cpumask.h> |
@@ -56,6 +57,70 @@ struct sched_param { | |||
56 | 57 | ||
57 | #include <asm/processor.h> | 58 | #include <asm/processor.h> |
58 | 59 | ||
60 | #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ | ||
61 | |||
62 | /* | ||
63 | * Extended scheduling parameters data structure. | ||
64 | * | ||
65 | * This is needed because the original struct sched_param can not be | ||
66 | * altered without introducing ABI issues with legacy applications | ||
67 | * (e.g., in sched_getparam()). | ||
68 | * | ||
69 | * However, the possibility of specifying more than just a priority for | ||
70 | * the tasks may be useful for a wide variety of application fields, e.g., | ||
71 | * multimedia, streaming, automation and control, and many others. | ||
72 | * | ||
73 | * This variant (sched_attr) is meant to describe a so-called | ||
74 | * sporadic time-constrained task. In such model a task is specified by: | ||
75 | * - the activation period or minimum instance inter-arrival time; | ||
76 | * - the maximum (or average, depending on the actual scheduling | ||
77 | * discipline) computation time of all instances, a.k.a. runtime; | ||
78 | * - the deadline (relative to the actual activation time) of each | ||
79 | * instance. | ||
80 | * Very briefly, a periodic (sporadic) task asks for the execution of | ||
81 | * some specific computation --which is typically called an instance-- | ||
82 | * (at most) every period. Moreover, each instance typically lasts no more | ||
83 | * than the runtime and must be completed by time instant t equal to | ||
84 | * the instance activation time + the deadline. | ||
85 | * | ||
86 | * This is reflected by the actual fields of the sched_attr structure: | ||
87 | * | ||
88 | * @size size of the structure, for fwd/bwd compat. | ||
89 | * | ||
90 | * @sched_policy task's scheduling policy | ||
91 | * @sched_flags for customizing the scheduler behaviour | ||
92 | * @sched_nice task's nice value (SCHED_NORMAL/BATCH) | ||
93 | * @sched_priority task's static priority (SCHED_FIFO/RR) | ||
94 | * @sched_deadline representative of the task's deadline | ||
95 | * @sched_runtime representative of the task's runtime | ||
96 | * @sched_period representative of the task's period | ||
97 | * | ||
98 | * Given this task model, there are a multiplicity of scheduling algorithms | ||
99 | * and policies, that can be used to ensure all the tasks will make their | ||
100 | * timing constraints. | ||
101 | * | ||
102 | * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the | ||
103 | * only user of this new interface. More information about the algorithm | ||
104 | * is available in the scheduling class file or in Documentation/. | ||
105 | */ | ||
106 | struct sched_attr { | ||
107 | u32 size; | ||
108 | |||
109 | u32 sched_policy; | ||
110 | u64 sched_flags; | ||
111 | |||
112 | /* SCHED_NORMAL, SCHED_BATCH */ | ||
113 | s32 sched_nice; | ||
114 | |||
115 | /* SCHED_FIFO, SCHED_RR */ | ||
116 | u32 sched_priority; | ||
117 | |||
118 | /* SCHED_DEADLINE */ | ||
119 | u64 sched_runtime; | ||
120 | u64 sched_deadline; | ||
121 | u64 sched_period; | ||
122 | }; | ||
123 | |||
59 | struct exec_domain; | 124 | struct exec_domain; |
60 | struct futex_pi_state; | 125 | struct futex_pi_state; |
61 | struct robust_list_head; | 126 | struct robust_list_head; |
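To make the new interface concrete, here is a minimal userspace sketch that puts the calling task under SCHED_DEADLINE. The struct layout mirrors the sched_attr definition in the hunk above; the two-argument call follows the sys_sched_setattr() prototype declared further down in this merge (later kernels added a trailing flags argument, so a real program should follow its own headers), and __NR_sched_setattr is assumed to be provided by the installed kernel headers, since glibc has no wrapper. Values and error handling are illustrative only.

/* Minimal sketch: run the calling task as SCHED_DEADLINE with a 10ms
 * budget every 30ms. The struct mirrors sched_attr above; the syscall
 * number is assumed to come from the installed kernel headers, and the
 * two-argument form follows the sys_sched_setattr() prototype below. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_user {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;         /* ns */
        uint64_t sched_deadline;        /* ns */
        uint64_t sched_period;          /* ns */
};

#define SCHED_DEADLINE  6               /* from the uapi hunk below */

int main(void)
{
        struct sched_attr_user attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);       /* == SCHED_ATTR_SIZE_VER0 (48) */
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  = 10 * 1000 * 1000;  /* 10 ms */
        attr.sched_deadline = 30 * 1000 * 1000;  /* 30 ms */
        attr.sched_period   = 30 * 1000 * 1000;  /* 30 ms */

        if (syscall(__NR_sched_setattr, 0, &attr) < 0) {
                perror("sched_setattr");         /* EPERM without privilege */
                return 1;
        }
        puts("running as SCHED_DEADLINE");
        return 0;
}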
@@ -168,7 +233,6 @@ extern char ___assert_task_state[1 - 2*!!( | |||
168 | 233 | ||
169 | #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) | 234 | #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) |
170 | #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) | 235 | #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) |
171 | #define task_is_dead(task) ((task)->exit_state != 0) | ||
172 | #define task_is_stopped_or_traced(task) \ | 236 | #define task_is_stopped_or_traced(task) \ |
173 | ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) | 237 | ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) |
174 | #define task_contributes_to_load(task) \ | 238 | #define task_contributes_to_load(task) \ |
@@ -1029,6 +1093,51 @@ struct sched_rt_entity { | |||
1029 | #endif | 1093 | #endif |
1030 | }; | 1094 | }; |
1031 | 1095 | ||
1096 | struct sched_dl_entity { | ||
1097 | struct rb_node rb_node; | ||
1098 | |||
1099 | /* | ||
1100 | * Original scheduling parameters. Copied here from sched_attr | ||
1101 | * during sched_setscheduler2(), they will remain the same until | ||
1102 | * the next sched_setscheduler2(). | ||
1103 | */ | ||
1104 | u64 dl_runtime; /* maximum runtime for each instance */ | ||
1105 | u64 dl_deadline; /* relative deadline of each instance */ | ||
1106 | u64 dl_period; /* separation of two instances (period) */ | ||
1107 | u64 dl_bw; /* dl_runtime / dl_deadline */ | ||
1108 | |||
1109 | /* | ||
1110 | * Actual scheduling parameters. Initialized with the values above, | ||
1111 | * they are continuously updated during task execution. Note that | ||
1112 | * the remaining runtime could be < 0 in case we are in overrun. | ||
1113 | */ | ||
1114 | s64 runtime; /* remaining runtime for this instance */ | ||
1115 | u64 deadline; /* absolute deadline for this instance */ | ||
1116 | unsigned int flags; /* specifying the scheduler behaviour */ | ||
1117 | |||
1118 | /* | ||
1119 | * Some bool flags: | ||
1120 | * | ||
1121 | * @dl_throttled tells if we exhausted the runtime. If so, the | ||
1122 | * task has to wait for a replenishment to be performed at the | ||
1123 | * next firing of dl_timer. | ||
1124 | * | ||
1125 | * @dl_new tells if a new instance arrived. If so we must | ||
1126 | * start executing it with full runtime and reset its absolute | ||
1127 | * deadline; | ||
1128 | * | ||
1129 | * @dl_boosted tells if we are boosted due to DI. If so we are | ||
1130 | * outside bandwidth enforcement mechanism (but only until we | ||
1131 | * exit the critical section). | ||
1132 | */ | ||
1133 | int dl_throttled, dl_new, dl_boosted; | ||
1134 | |||
1135 | /* | ||
1136 | * Bandwidth enforcement timer. Each -deadline task has its | ||
1137 | * own bandwidth to be enforced, thus we need one timer per task. | ||
1138 | */ | ||
1139 | struct hrtimer dl_timer; | ||
1140 | }; | ||
1032 | 1141 | ||
1033 | struct rcu_node; | 1142 | struct rcu_node; |
1034 | 1143 | ||
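The dl_bw field caches the runtime/deadline ratio so admission control can compare bandwidths without dividing on hot paths. A rough sketch of that computation, assuming a 20-bit fixed-point scale similar to the scheduler's to_ratio() helper (which is not part of this hunk):

#include <stdint.h>
#include <stdio.h>

/* Sketch of the bandwidth ratio: runtime/deadline scaled to 20 fractional
 * bits, so a full CPU maps to 1 << 20. Mirrors the idea behind dl_bw; the
 * in-kernel helper is assumed, not shown in this hunk. */
static uint64_t dl_bw_of(uint64_t runtime_ns, uint64_t deadline_ns)
{
        if (deadline_ns == 0)
                return 0;               /* avoid divide-by-zero in the sketch */
        return (runtime_ns << 20) / deadline_ns;
}

int main(void)
{
        /* 10ms every 30ms, roughly one third of a CPU */
        uint64_t bw = dl_bw_of(10000000ULL, 30000000ULL);

        printf("bw = %llu (%.2f%% of 1<<20)\n",
               (unsigned long long)bw, 100.0 * bw / (1 << 20));
        return 0;
}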
@@ -1065,6 +1174,7 @@ struct task_struct { | |||
1065 | #ifdef CONFIG_CGROUP_SCHED | 1174 | #ifdef CONFIG_CGROUP_SCHED |
1066 | struct task_group *sched_task_group; | 1175 | struct task_group *sched_task_group; |
1067 | #endif | 1176 | #endif |
1177 | struct sched_dl_entity dl; | ||
1068 | 1178 | ||
1069 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1179 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1070 | /* list of struct preempt_notifier: */ | 1180 | /* list of struct preempt_notifier: */ |
@@ -1098,6 +1208,7 @@ struct task_struct { | |||
1098 | struct list_head tasks; | 1208 | struct list_head tasks; |
1099 | #ifdef CONFIG_SMP | 1209 | #ifdef CONFIG_SMP |
1100 | struct plist_node pushable_tasks; | 1210 | struct plist_node pushable_tasks; |
1211 | struct rb_node pushable_dl_tasks; | ||
1101 | #endif | 1212 | #endif |
1102 | 1213 | ||
1103 | struct mm_struct *mm, *active_mm; | 1214 | struct mm_struct *mm, *active_mm; |
@@ -1249,9 +1360,12 @@ struct task_struct { | |||
1249 | 1360 | ||
1250 | #ifdef CONFIG_RT_MUTEXES | 1361 | #ifdef CONFIG_RT_MUTEXES |
1251 | /* PI waiters blocked on a rt_mutex held by this task */ | 1362 | /* PI waiters blocked on a rt_mutex held by this task */ |
1252 | struct plist_head pi_waiters; | 1363 | struct rb_root pi_waiters; |
1364 | struct rb_node *pi_waiters_leftmost; | ||
1253 | /* Deadlock detection and priority inheritance handling */ | 1365 | /* Deadlock detection and priority inheritance handling */ |
1254 | struct rt_mutex_waiter *pi_blocked_on; | 1366 | struct rt_mutex_waiter *pi_blocked_on; |
1367 | /* Top pi_waiters task */ | ||
1368 | struct task_struct *pi_top_task; | ||
1255 | #endif | 1369 | #endif |
1256 | 1370 | ||
1257 | #ifdef CONFIG_DEBUG_MUTEXES | 1371 | #ifdef CONFIG_DEBUG_MUTEXES |
@@ -1880,7 +1994,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
1880 | * but then during bootup it turns out that sched_clock() | 1994 | * but then during bootup it turns out that sched_clock() |
1881 | * is reliable after all: | 1995 | * is reliable after all: |
1882 | */ | 1996 | */ |
1883 | extern int sched_clock_stable; | 1997 | extern int sched_clock_stable(void); |
1998 | extern void set_sched_clock_stable(void); | ||
1999 | extern void clear_sched_clock_stable(void); | ||
1884 | 2000 | ||
1885 | extern void sched_clock_tick(void); | 2001 | extern void sched_clock_tick(void); |
1886 | extern void sched_clock_idle_sleep_event(void); | 2002 | extern void sched_clock_idle_sleep_event(void); |
@@ -1959,6 +2075,8 @@ extern int sched_setscheduler(struct task_struct *, int, | |||
1959 | const struct sched_param *); | 2075 | const struct sched_param *); |
1960 | extern int sched_setscheduler_nocheck(struct task_struct *, int, | 2076 | extern int sched_setscheduler_nocheck(struct task_struct *, int, |
1961 | const struct sched_param *); | 2077 | const struct sched_param *); |
2078 | extern int sched_setattr(struct task_struct *, | ||
2079 | const struct sched_attr *); | ||
1962 | extern struct task_struct *idle_task(int cpu); | 2080 | extern struct task_struct *idle_task(int cpu); |
1963 | /** | 2081 | /** |
1964 | * is_idle_task - is the specified task an idle task? | 2082 | * is_idle_task - is the specified task an idle task? |
@@ -2038,7 +2156,7 @@ extern void wake_up_new_task(struct task_struct *tsk); | |||
2038 | #else | 2156 | #else |
2039 | static inline void kick_process(struct task_struct *tsk) { } | 2157 | static inline void kick_process(struct task_struct *tsk) { } |
2040 | #endif | 2158 | #endif |
2041 | extern void sched_fork(unsigned long clone_flags, struct task_struct *p); | 2159 | extern int sched_fork(unsigned long clone_flags, struct task_struct *p); |
2042 | extern void sched_dead(struct task_struct *p); | 2160 | extern void sched_dead(struct task_struct *p); |
2043 | 2161 | ||
2044 | extern void proc_caches_init(void); | 2162 | extern void proc_caches_init(void); |
@@ -2627,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void) | |||
2627 | } | 2745 | } |
2628 | #endif | 2746 | #endif |
2629 | 2747 | ||
2748 | static inline void current_clr_polling(void) | ||
2749 | { | ||
2750 | __current_clr_polling(); | ||
2751 | |||
2752 | /* | ||
2753 | * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. | ||
2754 | * Once the bit is cleared, we'll get IPIs with every new | ||
2755 | * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also | ||
2756 | * fold. | ||
2757 | */ | ||
2758 | smp_mb(); /* paired with resched_task() */ | ||
2759 | |||
2760 | preempt_fold_need_resched(); | ||
2761 | } | ||
2762 | |||
2630 | static __always_inline bool need_resched(void) | 2763 | static __always_inline bool need_resched(void) |
2631 | { | 2764 | { |
2632 | return unlikely(tif_need_resched()); | 2765 | return unlikely(tif_need_resched()); |
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h new file mode 100644 index 000000000000..9d303b8847df --- /dev/null +++ b/include/linux/sched/deadline.h | |||
@@ -0,0 +1,24 @@ | |||
1 | #ifndef _SCHED_DEADLINE_H | ||
2 | #define _SCHED_DEADLINE_H | ||
3 | |||
4 | /* | ||
5 | * SCHED_DEADLINE tasks have negative priorities, reflecting | ||
6 | * the fact that any of them has higher prio than RT and | ||
7 | * NORMAL/BATCH tasks. | ||
8 | */ | ||
9 | |||
10 | #define MAX_DL_PRIO 0 | ||
11 | |||
12 | static inline int dl_prio(int prio) | ||
13 | { | ||
14 | if (unlikely(prio < MAX_DL_PRIO)) | ||
15 | return 1; | ||
16 | return 0; | ||
17 | } | ||
18 | |||
19 | static inline int dl_task(struct task_struct *p) | ||
20 | { | ||
21 | return dl_prio(p->prio); | ||
22 | } | ||
23 | |||
24 | #endif /* _SCHED_DEADLINE_H */ | ||
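To make the negative-priority convention concrete: a -deadline task's prio sits below MAX_DL_PRIO (0), so it orders ahead of every RT priority and of the normal classes. Below is a standalone sketch of a classifier over the unified prio scale; the MAX_RT_PRIO cutoff of 100 is the usual kernel convention, assumed here rather than taken from this hunk.

#include <stdio.h>

#define MAX_DL_PRIO     0       /* from the new header above */
#define MAX_RT_PRIO     100     /* conventional kernel value, assumed here */

/* Lower numeric prio means more important; deadline tasks use prio < 0. */
static const char *classify(int prio)
{
        if (prio < MAX_DL_PRIO)
                return "deadline";
        if (prio < MAX_RT_PRIO)
                return "realtime (FIFO/RR)";
        return "normal/batch/idle";
}

int main(void)
{
        int samples[] = { -1, 0, 50, 99, 120 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("prio %4d -> %s\n", samples[i], classify(samples[i]));
        return 0;
}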
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 440434df3627..34e4ebea8fce 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h | |||
@@ -35,6 +35,7 @@ static inline int rt_task(struct task_struct *p) | |||
35 | #ifdef CONFIG_RT_MUTEXES | 35 | #ifdef CONFIG_RT_MUTEXES |
36 | extern int rt_mutex_getprio(struct task_struct *p); | 36 | extern int rt_mutex_getprio(struct task_struct *p); |
37 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 37 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
38 | extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); | ||
38 | extern void rt_mutex_adjust_pi(struct task_struct *p); | 39 | extern void rt_mutex_adjust_pi(struct task_struct *p); |
39 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) | 40 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) |
40 | { | 41 | { |
@@ -45,6 +46,10 @@ static inline int rt_mutex_getprio(struct task_struct *p) | |||
45 | { | 46 | { |
46 | return p->normal_prio; | 47 | return p->normal_prio; |
47 | } | 48 | } |
49 | static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) | ||
50 | { | ||
51 | return NULL; | ||
52 | } | ||
48 | # define rt_mutex_adjust_pi(p) do { } while (0) | 53 | # define rt_mutex_adjust_pi(p) do { } while (0) |
49 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) | 54 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) |
50 | { | 55 | { |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 41467f8ff8ec..31e0193cb0c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
@@ -48,7 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay; | |||
48 | extern unsigned int sysctl_numa_balancing_scan_period_min; | 48 | extern unsigned int sysctl_numa_balancing_scan_period_min; |
49 | extern unsigned int sysctl_numa_balancing_scan_period_max; | 49 | extern unsigned int sysctl_numa_balancing_scan_period_max; |
50 | extern unsigned int sysctl_numa_balancing_scan_size; | 50 | extern unsigned int sysctl_numa_balancing_scan_size; |
51 | extern unsigned int sysctl_numa_balancing_settle_count; | ||
52 | 51 | ||
53 | #ifdef CONFIG_SCHED_DEBUG | 52 | #ifdef CONFIG_SCHED_DEBUG |
54 | extern unsigned int sysctl_sched_migration_cost; | 53 | extern unsigned int sysctl_sched_migration_cost; |
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index bdb9993f0fda..42dfab89e740 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h | |||
@@ -131,8 +131,7 @@ static inline void __raw_spin_lock_irq(raw_spinlock_t *lock) | |||
131 | 131 | ||
132 | static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) | 132 | static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) |
133 | { | 133 | { |
134 | local_bh_disable(); | 134 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
135 | preempt_disable(); | ||
136 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 135 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
137 | LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); | 136 | LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); |
138 | } | 137 | } |
@@ -174,20 +173,17 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock) | |||
174 | { | 173 | { |
175 | spin_release(&lock->dep_map, 1, _RET_IP_); | 174 | spin_release(&lock->dep_map, 1, _RET_IP_); |
176 | do_raw_spin_unlock(lock); | 175 | do_raw_spin_unlock(lock); |
177 | preempt_enable_no_resched(); | 176 | __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
178 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | ||
179 | } | 177 | } |
180 | 178 | ||
181 | static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) | 179 | static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) |
182 | { | 180 | { |
183 | local_bh_disable(); | 181 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
184 | preempt_disable(); | ||
185 | if (do_raw_spin_trylock(lock)) { | 182 | if (do_raw_spin_trylock(lock)) { |
186 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | 183 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); |
187 | return 1; | 184 | return 1; |
188 | } | 185 | } |
189 | preempt_enable_no_resched(); | 186 | __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); |
190 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | ||
191 | return 0; | 187 | return 0; |
192 | } | 188 | } |
193 | 189 | ||
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index af1f47229e70..d0d188861ad6 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h | |||
@@ -24,11 +24,14 @@ | |||
24 | * flags straight, to suppress compiler warnings of unused lock | 24 | * flags straight, to suppress compiler warnings of unused lock |
25 | * variables, and to add the proper checker annotations: | 25 | * variables, and to add the proper checker annotations: |
26 | */ | 26 | */ |
27 | #define ___LOCK(lock) \ | ||
28 | do { __acquire(lock); (void)(lock); } while (0) | ||
29 | |||
27 | #define __LOCK(lock) \ | 30 | #define __LOCK(lock) \ |
28 | do { preempt_disable(); __acquire(lock); (void)(lock); } while (0) | 31 | do { preempt_disable(); ___LOCK(lock); } while (0) |
29 | 32 | ||
30 | #define __LOCK_BH(lock) \ | 33 | #define __LOCK_BH(lock) \ |
31 | do { local_bh_disable(); __LOCK(lock); } while (0) | 34 | do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0) |
32 | 35 | ||
33 | #define __LOCK_IRQ(lock) \ | 36 | #define __LOCK_IRQ(lock) \ |
34 | do { local_irq_disable(); __LOCK(lock); } while (0) | 37 | do { local_irq_disable(); __LOCK(lock); } while (0) |
@@ -36,12 +39,15 @@ | |||
36 | #define __LOCK_IRQSAVE(lock, flags) \ | 39 | #define __LOCK_IRQSAVE(lock, flags) \ |
37 | do { local_irq_save(flags); __LOCK(lock); } while (0) | 40 | do { local_irq_save(flags); __LOCK(lock); } while (0) |
38 | 41 | ||
42 | #define ___UNLOCK(lock) \ | ||
43 | do { __release(lock); (void)(lock); } while (0) | ||
44 | |||
39 | #define __UNLOCK(lock) \ | 45 | #define __UNLOCK(lock) \ |
40 | do { preempt_enable(); __release(lock); (void)(lock); } while (0) | 46 | do { preempt_enable(); ___UNLOCK(lock); } while (0) |
41 | 47 | ||
42 | #define __UNLOCK_BH(lock) \ | 48 | #define __UNLOCK_BH(lock) \ |
43 | do { preempt_enable_no_resched(); local_bh_enable(); \ | 49 | do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \ |
44 | __release(lock); (void)(lock); } while (0) | 50 | ___UNLOCK(lock); } while (0) |
45 | 51 | ||
46 | #define __UNLOCK_IRQ(lock) \ | 52 | #define __UNLOCK_IRQ(lock) \ |
47 | do { local_irq_enable(); __UNLOCK(lock); } while (0) | 53 | do { local_irq_enable(); __UNLOCK(lock); } while (0) |
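The new ___LOCK()/___UNLOCK() inner layer lets the BH variants splice in __local_bh_disable_ip() without duplicating the checker annotations, and the whole stack relies on the usual do { ... } while (0) wrapper so each macro still behaves like a single statement. A tiny standalone illustration of that idiom, with invented demo macros:

#include <stdio.h>

/* Invented demo macros: the inner layer does the real work, the outer
 * layers add policy, much like ___LOCK()/__LOCK()/__LOCK_BH() above. */
#define ___STEP(x)      do { printf("step %d\n", (x)); } while (0)
#define __STEP(x)       do { printf("prepare\n"); ___STEP(x); } while (0)
#define __STEP_EXTRA(x) do { printf("extra prepare\n"); ___STEP(x); } while (0)

int main(void)
{
        int verbose = 0;

        /* Without the do/while(0) wrapping this if/else would mis-parse,
         * because each macro body would be a bare block followed by ';'. */
        if (verbose)
                __STEP_EXTRA(1);
        else
                __STEP(2);
        return 0;
}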
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 94273bbe6050..40ed9e9a77e5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -38,6 +38,7 @@ struct rlimit; | |||
38 | struct rlimit64; | 38 | struct rlimit64; |
39 | struct rusage; | 39 | struct rusage; |
40 | struct sched_param; | 40 | struct sched_param; |
41 | struct sched_attr; | ||
41 | struct sel_arg_struct; | 42 | struct sel_arg_struct; |
42 | struct semaphore; | 43 | struct semaphore; |
43 | struct sembuf; | 44 | struct sembuf; |
@@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |||
279 | struct sched_param __user *param); | 280 | struct sched_param __user *param); |
280 | asmlinkage long sys_sched_setparam(pid_t pid, | 281 | asmlinkage long sys_sched_setparam(pid_t pid, |
281 | struct sched_param __user *param); | 282 | struct sched_param __user *param); |
283 | asmlinkage long sys_sched_setattr(pid_t pid, | ||
284 | struct sched_attr __user *attr); | ||
282 | asmlinkage long sys_sched_getscheduler(pid_t pid); | 285 | asmlinkage long sys_sched_getscheduler(pid_t pid); |
283 | asmlinkage long sys_sched_getparam(pid_t pid, | 286 | asmlinkage long sys_sched_getparam(pid_t pid, |
284 | struct sched_param __user *param); | 287 | struct sched_param __user *param); |
288 | asmlinkage long sys_sched_getattr(pid_t pid, | ||
289 | struct sched_attr __user *attr, | ||
290 | unsigned int size); | ||
285 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | 291 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, |
286 | unsigned long __user *user_mask_ptr); | 292 | unsigned long __user *user_mask_ptr); |
287 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | 293 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, |

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9d8cf056e661..ecd3319dac33 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h | |||
@@ -25,13 +25,16 @@ static inline void pagefault_disable(void) | |||
25 | 25 | ||
26 | static inline void pagefault_enable(void) | 26 | static inline void pagefault_enable(void) |
27 | { | 27 | { |
28 | #ifndef CONFIG_PREEMPT | ||
28 | /* | 29 | /* |
29 | * make sure to issue those last loads/stores before enabling | 30 | * make sure to issue those last loads/stores before enabling |
30 | * the pagefault handler again. | 31 | * the pagefault handler again. |
31 | */ | 32 | */ |
32 | barrier(); | 33 | barrier(); |
33 | preempt_count_dec(); | 34 | preempt_count_dec(); |
34 | preempt_check_resched(); | 35 | #else |
36 | preempt_enable(); | ||
37 | #endif | ||
35 | } | 38 | } |
36 | 39 | ||
37 | #ifndef ARCH_HAS_NOCACHE_UACCESS | 40 | #ifndef ARCH_HAS_NOCACHE_UACCESS |
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 829627d7b846..1d67fb6b23a0 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h | |||
@@ -42,27 +42,10 @@ static inline bool net_busy_loop_on(void) | |||
42 | return sysctl_net_busy_poll; | 42 | return sysctl_net_busy_poll; |
43 | } | 43 | } |
44 | 44 | ||
45 | /* a wrapper to make debug_smp_processor_id() happy | ||
46 | * we can use sched_clock() because we don't care much about precision | ||
47 | * we only care that the average is bounded | ||
48 | */ | ||
49 | #ifdef CONFIG_DEBUG_PREEMPT | ||
50 | static inline u64 busy_loop_us_clock(void) | ||
51 | { | ||
52 | u64 rc; | ||
53 | |||
54 | preempt_disable_notrace(); | ||
55 | rc = sched_clock(); | ||
56 | preempt_enable_no_resched_notrace(); | ||
57 | |||
58 | return rc >> 10; | ||
59 | } | ||
60 | #else /* CONFIG_DEBUG_PREEMPT */ | ||
61 | static inline u64 busy_loop_us_clock(void) | 45 | static inline u64 busy_loop_us_clock(void) |
62 | { | 46 | { |
63 | return sched_clock() >> 10; | 47 | return local_clock() >> 10; |
64 | } | 48 | } |
65 | #endif /* CONFIG_DEBUG_PREEMPT */ | ||
66 | 49 | ||
67 | static inline unsigned long sk_busy_loop_end_time(struct sock *sk) | 50 | static inline unsigned long sk_busy_loop_end_time(struct sock *sk) |
68 | { | 51 | { |
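The replacement keeps the long-standing trick of approximating microseconds by shifting the nanosecond clock right by 10, i.e. dividing by 1024 instead of 1000, which is cheap and close enough for a busy-poll budget. A quick standalone check of the error this introduces:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t ns = 3000000000ULL;            /* 3 seconds worth of ns */
        uint64_t approx_us = ns >> 10;          /* what busy_loop_us_clock() does */
        uint64_t exact_us  = ns / 1000;

        /* >>10 divides by 1024, so the "microseconds" run about 2.3% slow */
        printf("approx=%llu exact=%llu error=%.2f%%\n",
               (unsigned long long)approx_us,
               (unsigned long long)exact_us,
               100.0 * ((double)exact_us - (double)approx_us) / (double)exact_us);
        return 0;
}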
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5a0f945927ac..34f9d7387d13 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h | |||
@@ -39,8 +39,14 @@ | |||
39 | #define SCHED_BATCH 3 | 39 | #define SCHED_BATCH 3 |
40 | /* SCHED_ISO: reserved but not implemented yet */ | 40 | /* SCHED_ISO: reserved but not implemented yet */ |
41 | #define SCHED_IDLE 5 | 41 | #define SCHED_IDLE 5 |
42 | #define SCHED_DEADLINE 6 | ||
43 | |||
42 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ | 44 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ |
43 | #define SCHED_RESET_ON_FORK 0x40000000 | 45 | #define SCHED_RESET_ON_FORK 0x40000000 |
44 | 46 | ||
47 | /* | ||
48 | * For the sched_{set,get}attr() calls | ||
49 | */ | ||
50 | #define SCHED_FLAG_RESET_ON_FORK 0x01 | ||
45 | 51 | ||
46 | #endif /* _UAPI_LINUX_SCHED_H */ | 52 | #endif /* _UAPI_LINUX_SCHED_H */ |
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a9a387..277f494c2a9a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
@@ -105,14 +105,17 @@ static void cpu_idle_loop(void) | |||
105 | __current_set_polling(); | 105 | __current_set_polling(); |
106 | } | 106 | } |
107 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
108 | /* | ||
109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
110 | * bit here because we might not have send the | ||
111 | * reschedule IPI to idle tasks. | ||
112 | */ | ||
113 | if (tif_need_resched()) | ||
114 | set_preempt_need_resched(); | ||
115 | } | 108 | } |
109 | |||
110 | /* | ||
111 | * Since we fell out of the loop above, we know | ||
112 | * TIF_NEED_RESCHED must be set, propagate it into | ||
113 | * PREEMPT_NEED_RESCHED. | ||
114 | * | ||
115 | * This is required because for polling idle loops we will | ||
116 | * not have had an IPI to fold the state for us. | ||
117 | */ | ||
118 | preempt_set_need_resched(); | ||
116 | tick_nohz_idle_exit(); | 119 | tick_nohz_idle_exit(); |
117 | schedule_preempt_disabled(); | 120 | schedule_preempt_disabled(); |
118 | } | 121 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index dfa736c98d17..294189fc7ac8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1087,8 +1087,10 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1087 | { | 1087 | { |
1088 | raw_spin_lock_init(&p->pi_lock); | 1088 | raw_spin_lock_init(&p->pi_lock); |
1089 | #ifdef CONFIG_RT_MUTEXES | 1089 | #ifdef CONFIG_RT_MUTEXES |
1090 | plist_head_init(&p->pi_waiters); | 1090 | p->pi_waiters = RB_ROOT; |
1091 | p->pi_waiters_leftmost = NULL; | ||
1091 | p->pi_blocked_on = NULL; | 1092 | p->pi_blocked_on = NULL; |
1093 | p->pi_top_task = NULL; | ||
1092 | #endif | 1094 | #endif |
1093 | } | 1095 | } |
1094 | 1096 | ||
@@ -1311,7 +1313,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1311 | #endif | 1313 | #endif |
1312 | 1314 | ||
1313 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1315 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1314 | sched_fork(clone_flags, p); | 1316 | retval = sched_fork(clone_flags, p); |
1317 | if (retval) | ||
1318 | goto bad_fork_cleanup_policy; | ||
1315 | 1319 | ||
1316 | retval = perf_event_init_task(p); | 1320 | retval = perf_event_init_task(p); |
1317 | if (retval) | 1321 | if (retval) |
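Since sched_fork() can now fail, copy_process() unwinds through its usual goto-based cleanup ladder. A generic standalone sketch of that error-handling pattern, with invented helper names standing in for the real setup steps:

#include <stdio.h>

/* Invented helpers standing in for the real setup steps. */
static int setup_a(void) { puts("a: ok");   return 0; }
static int setup_b(void) { puts("b: fail"); return -1; }
static void undo_a(void) { puts("undo a"); }

static int create_thing(void)
{
        int retval;

        retval = setup_a();
        if (retval)
                goto out;
        retval = setup_b();             /* like: retval = sched_fork(...) */
        if (retval)
                goto cleanup_a;         /* like: goto bad_fork_cleanup_policy */

        return 0;

cleanup_a:
        undo_a();
out:
        return retval;
}

int main(void)
{
        return create_thing() ? 1 : 0;
}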
@@ -1403,13 +1407,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1403 | p->tgid = p->pid; | 1407 | p->tgid = p->pid; |
1404 | } | 1408 | } |
1405 | 1409 | ||
1406 | p->pdeath_signal = 0; | ||
1407 | p->exit_state = 0; | ||
1408 | |||
1409 | p->nr_dirtied = 0; | 1410 | p->nr_dirtied = 0; |
1410 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); | 1411 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); |
1411 | p->dirty_paused_when = 0; | 1412 | p->dirty_paused_when = 0; |
1412 | 1413 | ||
1414 | p->pdeath_signal = 0; | ||
1413 | INIT_LIST_HEAD(&p->thread_group); | 1415 | INIT_LIST_HEAD(&p->thread_group); |
1414 | p->task_works = NULL; | 1416 | p->task_works = NULL; |
1415 | 1417 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index 1ddc4498f1e1..44a1261cb9ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -2426,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2426 | * code while we sleep on uaddr. | 2426 | * code while we sleep on uaddr. |
2427 | */ | 2427 | */ |
2428 | debug_rt_mutex_init_waiter(&rt_waiter); | 2428 | debug_rt_mutex_init_waiter(&rt_waiter); |
2429 | RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); | ||
2430 | RB_CLEAR_NODE(&rt_waiter.tree_entry); | ||
2429 | rt_waiter.task = NULL; | 2431 | rt_waiter.task = NULL; |
2430 | 2432 | ||
2431 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); | 2433 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..09094361dce5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/sched.h> | 46 | #include <linux/sched.h> |
47 | #include <linux/sched/sysctl.h> | 47 | #include <linux/sched/sysctl.h> |
48 | #include <linux/sched/rt.h> | 48 | #include <linux/sched/rt.h> |
49 | #include <linux/sched/deadline.h> | ||
49 | #include <linux/timer.h> | 50 | #include <linux/timer.h> |
50 | #include <linux/freezer.h> | 51 | #include <linux/freezer.h> |
51 | 52 | ||
@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
1610 | unsigned long slack; | 1611 | unsigned long slack; |
1611 | 1612 | ||
1612 | slack = current->timer_slack_ns; | 1613 | slack = current->timer_slack_ns; |
1613 | if (rt_task(current)) | 1614 | if (dl_task(current) || rt_task(current)) |
1614 | slack = 0; | 1615 | slack = 0; |
1615 | 1616 | ||
1616 | hrtimer_init_on_stack(&t.timer, clockid, mode); | 1617 | hrtimer_init_on_stack(&t.timer, clockid, mode); |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
@@ -24,7 +24,7 @@ | |||
24 | #include <linux/kallsyms.h> | 24 | #include <linux/kallsyms.h> |
25 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
27 | #include <linux/plist.h> | 27 | #include <linux/rbtree.h> |
28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
29 | #include <linux/debug_locks.h> | 29 | #include <linux/debug_locks.h> |
30 | 30 | ||
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) | |||
57 | 57 | ||
58 | void rt_mutex_debug_task_free(struct task_struct *task) | 58 | void rt_mutex_debug_task_free(struct task_struct *task) |
59 | { | 59 | { |
60 | DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); | 60 | DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); |
61 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); | 61 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); |
62 | } | 62 | } |
63 | 63 | ||
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | |||
154 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | 154 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) |
155 | { | 155 | { |
156 | memset(waiter, 0x11, sizeof(*waiter)); | 156 | memset(waiter, 0x11, sizeof(*waiter)); |
157 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
158 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
159 | waiter->deadlock_task_pid = NULL; | 157 | waiter->deadlock_task_pid = NULL; |
160 | } | 158 | } |
161 | 159 | ||
162 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | 160 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) |
163 | { | 161 | { |
164 | put_pid(waiter->deadlock_task_pid); | 162 | put_pid(waiter->deadlock_task_pid); |
165 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
166 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
167 | memset(waiter, 0x22, sizeof(*waiter)); | 163 | memset(waiter, 0x22, sizeof(*waiter)); |
168 | } | 164 | } |
169 | 165 | ||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..2e960a2bab81 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> |
17 | #include <linux/sched/deadline.h> | ||
17 | #include <linux/timer.h> | 18 | #include <linux/timer.h> |
18 | 19 | ||
19 | #include "rtmutex_common.h" | 20 | #include "rtmutex_common.h" |
@@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | |||
91 | } | 92 | } |
92 | #endif | 93 | #endif |
93 | 94 | ||
95 | static inline int | ||
96 | rt_mutex_waiter_less(struct rt_mutex_waiter *left, | ||
97 | struct rt_mutex_waiter *right) | ||
98 | { | ||
99 | if (left->prio < right->prio) | ||
100 | return 1; | ||
101 | |||
102 | /* | ||
103 | * If both waiters have dl_prio(), we check the deadlines of the | ||
104 | * associated tasks. | ||
105 | * If left waiter has a dl_prio(), and we didn't return 1 above, | ||
106 | * then right waiter has a dl_prio() too. | ||
107 | */ | ||
108 | if (dl_prio(left->prio)) | ||
109 | return (left->task->dl.deadline < right->task->dl.deadline); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | static void | ||
115 | rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | ||
116 | { | ||
117 | struct rb_node **link = &lock->waiters.rb_node; | ||
118 | struct rb_node *parent = NULL; | ||
119 | struct rt_mutex_waiter *entry; | ||
120 | int leftmost = 1; | ||
121 | |||
122 | while (*link) { | ||
123 | parent = *link; | ||
124 | entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); | ||
125 | if (rt_mutex_waiter_less(waiter, entry)) { | ||
126 | link = &parent->rb_left; | ||
127 | } else { | ||
128 | link = &parent->rb_right; | ||
129 | leftmost = 0; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | if (leftmost) | ||
134 | lock->waiters_leftmost = &waiter->tree_entry; | ||
135 | |||
136 | rb_link_node(&waiter->tree_entry, parent, link); | ||
137 | rb_insert_color(&waiter->tree_entry, &lock->waiters); | ||
138 | } | ||
139 | |||
140 | static void | ||
141 | rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | ||
142 | { | ||
143 | if (RB_EMPTY_NODE(&waiter->tree_entry)) | ||
144 | return; | ||
145 | |||
146 | if (lock->waiters_leftmost == &waiter->tree_entry) | ||
147 | lock->waiters_leftmost = rb_next(&waiter->tree_entry); | ||
148 | |||
149 | rb_erase(&waiter->tree_entry, &lock->waiters); | ||
150 | RB_CLEAR_NODE(&waiter->tree_entry); | ||
151 | } | ||
152 | |||
153 | static void | ||
154 | rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | ||
155 | { | ||
156 | struct rb_node **link = &task->pi_waiters.rb_node; | ||
157 | struct rb_node *parent = NULL; | ||
158 | struct rt_mutex_waiter *entry; | ||
159 | int leftmost = 1; | ||
160 | |||
161 | while (*link) { | ||
162 | parent = *link; | ||
163 | entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); | ||
164 | if (rt_mutex_waiter_less(waiter, entry)) { | ||
165 | link = &parent->rb_left; | ||
166 | } else { | ||
167 | link = &parent->rb_right; | ||
168 | leftmost = 0; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | if (leftmost) | ||
173 | task->pi_waiters_leftmost = &waiter->pi_tree_entry; | ||
174 | |||
175 | rb_link_node(&waiter->pi_tree_entry, parent, link); | ||
176 | rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); | ||
177 | } | ||
178 | |||
179 | static void | ||
180 | rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | ||
181 | { | ||
182 | if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) | ||
183 | return; | ||
184 | |||
185 | if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) | ||
186 | task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); | ||
187 | |||
188 | rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); | ||
189 | RB_CLEAR_NODE(&waiter->pi_tree_entry); | ||
190 | } | ||
191 | |||
94 | /* | 192 | /* |
95 | * Calculate task priority from the waiter list priority | 193 | * Calculate task priority from the waiter tree priority |
96 | * | 194 | * |
97 | * Return task->normal_prio when the waiter list is empty or when | 195 | * Return task->normal_prio when the waiter tree is empty or when |
98 | * the waiter is not allowed to do priority boosting | 196 | * the waiter is not allowed to do priority boosting |
99 | */ | 197 | */ |
100 | int rt_mutex_getprio(struct task_struct *task) | 198 | int rt_mutex_getprio(struct task_struct *task) |
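rt_mutex_waiter_less() keeps both rbtrees ordered so that any -deadline waiter (negative prio) sorts ahead of every RT waiter, and two -deadline waiters are tie-broken by the earlier absolute deadline. A plain-C model of that comparator, using a stand-in struct rather than the kernel types:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the waiter ordering; field names echo the hunk above,
 * but this is a model, not the kernel structures. */
struct waiter {
        int      prio;          /* < 0 means deadline class */
        uint64_t deadline;      /* absolute deadline, meaningful if prio < 0 */
};

static int waiter_less(const struct waiter *left, const struct waiter *right)
{
        if (left->prio < right->prio)
                return 1;
        /*
         * If we get here and left is a deadline waiter (prio < 0), right
         * must be one too, so tie-break on the earlier absolute deadline.
         */
        if (left->prio < 0)
                return left->deadline < right->deadline;
        return 0;
}

int main(void)
{
        struct waiter dl1 = { .prio = -1, .deadline = 100 };
        struct waiter dl2 = { .prio = -1, .deadline = 200 };
        struct waiter rt  = { .prio = 10 };

        printf("dl(100) before dl(200): %d\n", waiter_less(&dl1, &dl2)); /* 1 */
        printf("dl before rt:           %d\n", waiter_less(&dl1, &rt));  /* 1 */
        printf("rt before dl:           %d\n", waiter_less(&rt, &dl1));  /* 0 */
        return 0;
}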
@@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task) | |||
102 | if (likely(!task_has_pi_waiters(task))) | 200 | if (likely(!task_has_pi_waiters(task))) |
103 | return task->normal_prio; | 201 | return task->normal_prio; |
104 | 202 | ||
105 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | 203 | return min(task_top_pi_waiter(task)->prio, |
106 | task->normal_prio); | 204 | task->normal_prio); |
107 | } | 205 | } |
108 | 206 | ||
207 | struct task_struct *rt_mutex_get_top_task(struct task_struct *task) | ||
208 | { | ||
209 | if (likely(!task_has_pi_waiters(task))) | ||
210 | return NULL; | ||
211 | |||
212 | return task_top_pi_waiter(task)->task; | ||
213 | } | ||
214 | |||
109 | /* | 215 | /* |
110 | * Adjust the priority of a task, after its pi_waiters got modified. | 216 | * Adjust the priority of a task, after its pi_waiters got modified. |
111 | * | 217 | * |
@@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) | |||
115 | { | 221 | { |
116 | int prio = rt_mutex_getprio(task); | 222 | int prio = rt_mutex_getprio(task); |
117 | 223 | ||
118 | if (task->prio != prio) | 224 | if (task->prio != prio || dl_prio(prio)) |
119 | rt_mutex_setprio(task, prio); | 225 | rt_mutex_setprio(task, prio); |
120 | } | 226 | } |
121 | 227 | ||
@@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
233 | * When deadlock detection is off then we check, if further | 339 | * When deadlock detection is off then we check, if further |
234 | * priority adjustment is necessary. | 340 | * priority adjustment is necessary. |
235 | */ | 341 | */ |
236 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | 342 | if (!detect_deadlock && waiter->prio == task->prio) |
237 | goto out_unlock_pi; | 343 | goto out_unlock_pi; |
238 | 344 | ||
239 | lock = waiter->lock; | 345 | lock = waiter->lock; |
@@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
254 | top_waiter = rt_mutex_top_waiter(lock); | 360 | top_waiter = rt_mutex_top_waiter(lock); |
255 | 361 | ||
256 | /* Requeue the waiter */ | 362 | /* Requeue the waiter */ |
257 | plist_del(&waiter->list_entry, &lock->wait_list); | 363 | rt_mutex_dequeue(lock, waiter); |
258 | waiter->list_entry.prio = task->prio; | 364 | waiter->prio = task->prio; |
259 | plist_add(&waiter->list_entry, &lock->wait_list); | 365 | rt_mutex_enqueue(lock, waiter); |
260 | 366 | ||
261 | /* Release the task */ | 367 | /* Release the task */ |
262 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 368 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
@@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
280 | 386 | ||
281 | if (waiter == rt_mutex_top_waiter(lock)) { | 387 | if (waiter == rt_mutex_top_waiter(lock)) { |
282 | /* Boost the owner */ | 388 | /* Boost the owner */ |
283 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | 389 | rt_mutex_dequeue_pi(task, top_waiter); |
284 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | 390 | rt_mutex_enqueue_pi(task, waiter); |
285 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
286 | __rt_mutex_adjust_prio(task); | 391 | __rt_mutex_adjust_prio(task); |
287 | 392 | ||
288 | } else if (top_waiter == waiter) { | 393 | } else if (top_waiter == waiter) { |
289 | /* Deboost the owner */ | 394 | /* Deboost the owner */ |
290 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | 395 | rt_mutex_dequeue_pi(task, waiter); |
291 | waiter = rt_mutex_top_waiter(lock); | 396 | waiter = rt_mutex_top_waiter(lock); |
292 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | 397 | rt_mutex_enqueue_pi(task, waiter); |
293 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
294 | __rt_mutex_adjust_prio(task); | 398 | __rt_mutex_adjust_prio(task); |
295 | } | 399 | } |
296 | 400 | ||
@@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
355 | * 3) it is top waiter | 459 | * 3) it is top waiter |
356 | */ | 460 | */ |
357 | if (rt_mutex_has_waiters(lock)) { | 461 | if (rt_mutex_has_waiters(lock)) { |
358 | if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { | 462 | if (task->prio >= rt_mutex_top_waiter(lock)->prio) { |
359 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | 463 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) |
360 | return 0; | 464 | return 0; |
361 | } | 465 | } |
@@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
369 | 473 | ||
370 | /* remove the queued waiter. */ | 474 | /* remove the queued waiter. */ |
371 | if (waiter) { | 475 | if (waiter) { |
372 | plist_del(&waiter->list_entry, &lock->wait_list); | 476 | rt_mutex_dequeue(lock, waiter); |
373 | task->pi_blocked_on = NULL; | 477 | task->pi_blocked_on = NULL; |
374 | } | 478 | } |
375 | 479 | ||
@@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
379 | */ | 483 | */ |
380 | if (rt_mutex_has_waiters(lock)) { | 484 | if (rt_mutex_has_waiters(lock)) { |
381 | top = rt_mutex_top_waiter(lock); | 485 | top = rt_mutex_top_waiter(lock); |
382 | top->pi_list_entry.prio = top->list_entry.prio; | 486 | rt_mutex_enqueue_pi(task, top); |
383 | plist_add(&top->pi_list_entry, &task->pi_waiters); | ||
384 | } | 487 | } |
385 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 488 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
386 | } | 489 | } |
@@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
416 | __rt_mutex_adjust_prio(task); | 519 | __rt_mutex_adjust_prio(task); |
417 | waiter->task = task; | 520 | waiter->task = task; |
418 | waiter->lock = lock; | 521 | waiter->lock = lock; |
419 | plist_node_init(&waiter->list_entry, task->prio); | 522 | waiter->prio = task->prio; |
420 | plist_node_init(&waiter->pi_list_entry, task->prio); | ||
421 | 523 | ||
422 | /* Get the top priority waiter on the lock */ | 524 | /* Get the top priority waiter on the lock */ |
423 | if (rt_mutex_has_waiters(lock)) | 525 | if (rt_mutex_has_waiters(lock)) |
424 | top_waiter = rt_mutex_top_waiter(lock); | 526 | top_waiter = rt_mutex_top_waiter(lock); |
425 | plist_add(&waiter->list_entry, &lock->wait_list); | 527 | rt_mutex_enqueue(lock, waiter); |
426 | 528 | ||
427 | task->pi_blocked_on = waiter; | 529 | task->pi_blocked_on = waiter; |
428 | 530 | ||
@@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
433 | 535 | ||
434 | if (waiter == rt_mutex_top_waiter(lock)) { | 536 | if (waiter == rt_mutex_top_waiter(lock)) { |
435 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 537 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
436 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | 538 | rt_mutex_dequeue_pi(owner, top_waiter); |
437 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | 539 | rt_mutex_enqueue_pi(owner, waiter); |
438 | 540 | ||
439 | __rt_mutex_adjust_prio(owner); | 541 | __rt_mutex_adjust_prio(owner); |
440 | if (owner->pi_blocked_on) | 542 | if (owner->pi_blocked_on) |
@@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
486 | * boosted mode and go back to normal after releasing | 588 | * boosted mode and go back to normal after releasing |
487 | * lock->wait_lock. | 589 | * lock->wait_lock. |
488 | */ | 590 | */ |
489 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | 591 | rt_mutex_dequeue_pi(current, waiter); |
490 | 592 | ||
491 | rt_mutex_set_owner(lock, NULL); | 593 | rt_mutex_set_owner(lock, NULL); |
492 | 594 | ||
@@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock, | |||
510 | int chain_walk = 0; | 612 | int chain_walk = 0; |
511 | 613 | ||
512 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 614 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
513 | plist_del(&waiter->list_entry, &lock->wait_list); | 615 | rt_mutex_dequeue(lock, waiter); |
514 | current->pi_blocked_on = NULL; | 616 | current->pi_blocked_on = NULL; |
515 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 617 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
516 | 618 | ||
@@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
521 | 623 | ||
522 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 624 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
523 | 625 | ||
524 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | 626 | rt_mutex_dequeue_pi(owner, waiter); |
525 | 627 | ||
526 | if (rt_mutex_has_waiters(lock)) { | 628 | if (rt_mutex_has_waiters(lock)) { |
527 | struct rt_mutex_waiter *next; | 629 | struct rt_mutex_waiter *next; |
528 | 630 | ||
529 | next = rt_mutex_top_waiter(lock); | 631 | next = rt_mutex_top_waiter(lock); |
530 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | 632 | rt_mutex_enqueue_pi(owner, next); |
531 | } | 633 | } |
532 | __rt_mutex_adjust_prio(owner); | 634 | __rt_mutex_adjust_prio(owner); |
533 | 635 | ||
@@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock, | |||
537 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | 639 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); |
538 | } | 640 | } |
539 | 641 | ||
540 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
541 | |||
542 | if (!chain_walk) | 642 | if (!chain_walk) |
543 | return; | 643 | return; |
544 | 644 | ||
@@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
565 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 665 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
566 | 666 | ||
567 | waiter = task->pi_blocked_on; | 667 | waiter = task->pi_blocked_on; |
568 | if (!waiter || waiter->list_entry.prio == task->prio) { | 668 | if (!waiter || (waiter->prio == task->prio && |
669 | !dl_prio(task->prio))) { | ||
569 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 670 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
570 | return; | 671 | return; |
571 | } | 672 | } |
@@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
638 | int ret = 0; | 739 | int ret = 0; |
639 | 740 | ||
640 | debug_rt_mutex_init_waiter(&waiter); | 741 | debug_rt_mutex_init_waiter(&waiter); |
742 | RB_CLEAR_NODE(&waiter.pi_tree_entry); | ||
743 | RB_CLEAR_NODE(&waiter.tree_entry); | ||
641 | 744 | ||
642 | raw_spin_lock(&lock->wait_lock); | 745 | raw_spin_lock(&lock->wait_lock); |
643 | 746 | ||
@@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) | |||
904 | { | 1007 | { |
905 | lock->owner = NULL; | 1008 | lock->owner = NULL; |
906 | raw_spin_lock_init(&lock->wait_lock); | 1009 | raw_spin_lock_init(&lock->wait_lock); |
907 | plist_head_init(&lock->wait_list); | 1010 | lock->waiters = RB_ROOT; |
1011 | lock->waiters_leftmost = NULL; | ||
908 | 1012 | ||
909 | debug_rt_mutex_init(lock, name); | 1013 | debug_rt_mutex_init(lock, name); |
910 | } | 1014 | } |
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..7431a9c86f35 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); | |||
40 | * This is the control structure for tasks blocked on a rt_mutex, | 40 | * This is the control structure for tasks blocked on a rt_mutex, |
41 | * which is allocated on the kernel stack on of the blocked task. | 41 | * which is allocated on the kernel stack on of the blocked task. |
42 | * | 42 | * |
43 | * @list_entry: pi node to enqueue into the mutex waiters list | 43 | * @tree_entry: pi node to enqueue into the mutex waiters tree |
44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | 44 | * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree |
45 | * @task: task reference to the blocked task | 45 | * @task: task reference to the blocked task |
46 | */ | 46 | */ |
47 | struct rt_mutex_waiter { | 47 | struct rt_mutex_waiter { |
48 | struct plist_node list_entry; | 48 | struct rb_node tree_entry; |
49 | struct plist_node pi_list_entry; | 49 | struct rb_node pi_tree_entry; |
50 | struct task_struct *task; | 50 | struct task_struct *task; |
51 | struct rt_mutex *lock; | 51 | struct rt_mutex *lock; |
52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 52 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
@@ -54,14 +54,15 @@ struct rt_mutex_waiter { | |||
54 | struct pid *deadlock_task_pid; | 54 | struct pid *deadlock_task_pid; |
55 | struct rt_mutex *deadlock_lock; | 55 | struct rt_mutex *deadlock_lock; |
56 | #endif | 56 | #endif |
57 | int prio; | ||
57 | }; | 58 | }; |
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Various helpers to access the waiters-plist: | 61 | * Various helpers to access the waiters-tree: |
61 | */ | 62 | */ |
62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | 63 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) |
63 | { | 64 | { |
64 | return !plist_head_empty(&lock->wait_list); | 65 | return !RB_EMPTY_ROOT(&lock->waiters); |
65 | } | 66 | } |
66 | 67 | ||
67 | static inline struct rt_mutex_waiter * | 68 | static inline struct rt_mutex_waiter * |
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) | |||
69 | { | 70 | { |
70 | struct rt_mutex_waiter *w; | 71 | struct rt_mutex_waiter *w; |
71 | 72 | ||
72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | 73 | w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, |
73 | list_entry); | 74 | tree_entry); |
74 | BUG_ON(w->lock != lock); | 75 | BUG_ON(w->lock != lock); |
75 | 76 | ||
76 | return w; | 77 | return w; |
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) | |||
78 | 79 | ||
79 | static inline int task_has_pi_waiters(struct task_struct *p) | 80 | static inline int task_has_pi_waiters(struct task_struct *p) |
80 | { | 81 | { |
81 | return !plist_head_empty(&p->pi_waiters); | 82 | return !RB_EMPTY_ROOT(&p->pi_waiters); |
82 | } | 83 | } |
83 | 84 | ||
84 | static inline struct rt_mutex_waiter * | 85 | static inline struct rt_mutex_waiter * |
85 | task_top_pi_waiter(struct task_struct *p) | 86 | task_top_pi_waiter(struct task_struct *p) |
86 | { | 87 | { |
87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | 88 | return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, |
88 | pi_list_entry); | 89 | pi_tree_entry); |
89 | } | 90 | } |
90 | 91 | ||
91 | /* | 92 | /* |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b621409cf15..9a95c8c2af2a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o |
15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | ||
15 | obj-y += wait.o completion.o | 16 | obj-y += wait.o completion.o |
16 | obj-$(CONFIG_SMP) += cpupri.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
18 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
19 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 20 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..6bd6a6731b21 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -26,9 +26,10 @@ | |||
26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
27 | * | 27 | * |
28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
30 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current cpu. |
31 | * | 30 | * |
31 | * sched_clock_cpu(i) | ||
32 | * | ||
32 | * How: | 33 | * How: |
33 | * | 34 | * |
34 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
@@ -50,15 +51,6 @@ | |||
50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 51 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
51 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
52 | * | 53 | * |
53 | * | ||
54 | * Notes: | ||
55 | * | ||
56 | * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things | ||
57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
58 | * and cause funny jumps in time -- although the filtering provided by | ||
59 | * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it | ||
60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
61 | * sched_clock(). | ||
62 | */ | 54 | */ |
63 | #include <linux/spinlock.h> | 55 | #include <linux/spinlock.h> |
64 | #include <linux/hardirq.h> | 56 | #include <linux/hardirq.h> |
@@ -66,6 +58,8 @@ | |||
66 | #include <linux/percpu.h> | 58 | #include <linux/percpu.h> |
67 | #include <linux/ktime.h> | 59 | #include <linux/ktime.h> |
68 | #include <linux/sched.h> | 60 | #include <linux/sched.h> |
61 | #include <linux/static_key.h> | ||
62 | #include <linux/workqueue.h> | ||
69 | 63 | ||
70 | /* | 64 | /* |
71 | * Scheduler clock - returns current time in nanosec units. | 65 | * Scheduler clock - returns current time in nanosec units. |
@@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock); | |||
82 | __read_mostly int sched_clock_running; | 76 | __read_mostly int sched_clock_running; |
83 | 77 | ||
84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 78 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
85 | __read_mostly int sched_clock_stable; | 79 | static struct static_key __sched_clock_stable = STATIC_KEY_INIT; |
80 | |||
81 | int sched_clock_stable(void) | ||
82 | { | ||
83 | if (static_key_false(&__sched_clock_stable)) | ||
84 | return false; | ||
85 | return true; | ||
86 | } | ||
87 | |||
88 | void set_sched_clock_stable(void) | ||
89 | { | ||
90 | if (!sched_clock_stable()) | ||
91 | static_key_slow_dec(&__sched_clock_stable); | ||
92 | } | ||
93 | |||
94 | static void __clear_sched_clock_stable(struct work_struct *work) | ||
95 | { | ||
96 | /* XXX worry about clock continuity */ | ||
97 | if (sched_clock_stable()) | ||
98 | static_key_slow_inc(&__sched_clock_stable); | ||
99 | } | ||
100 | |||
101 | static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); | ||
102 | |||
103 | void clear_sched_clock_stable(void) | ||
104 | { | ||
105 | if (keventd_up()) | ||
106 | schedule_work(&sched_clock_work); | ||
107 | else | ||
108 | __clear_sched_clock_stable(&sched_clock_work); | ||
109 | } | ||
86 | 110 | ||
87 | struct sched_clock_data { | 111 | struct sched_clock_data { |
88 | u64 tick_raw; | 112 | u64 tick_raw; |
@@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu) | |||
242 | struct sched_clock_data *scd; | 266 | struct sched_clock_data *scd; |
243 | u64 clock; | 267 | u64 clock; |
244 | 268 | ||
245 | WARN_ON_ONCE(!irqs_disabled()); | 269 | if (sched_clock_stable()) |
246 | |||
247 | if (sched_clock_stable) | ||
248 | return sched_clock(); | 270 | return sched_clock(); |
249 | 271 | ||
250 | if (unlikely(!sched_clock_running)) | 272 | if (unlikely(!sched_clock_running)) |
251 | return 0ull; | 273 | return 0ull; |
252 | 274 | ||
275 | preempt_disable(); | ||
253 | scd = cpu_sdc(cpu); | 276 | scd = cpu_sdc(cpu); |
254 | 277 | ||
255 | if (cpu != smp_processor_id()) | 278 | if (cpu != smp_processor_id()) |
256 | clock = sched_clock_remote(scd); | 279 | clock = sched_clock_remote(scd); |
257 | else | 280 | else |
258 | clock = sched_clock_local(scd); | 281 | clock = sched_clock_local(scd); |
282 | preempt_enable(); | ||
259 | 283 | ||
260 | return clock; | 284 | return clock; |
261 | } | 285 | } |
@@ -265,7 +289,7 @@ void sched_clock_tick(void) | |||
265 | struct sched_clock_data *scd; | 289 | struct sched_clock_data *scd; |
266 | u64 now, now_gtod; | 290 | u64 now, now_gtod; |
267 | 291 | ||
268 | if (sched_clock_stable) | 292 | if (sched_clock_stable()) |
269 | return; | 293 | return; |
270 | 294 | ||
271 | if (unlikely(!sched_clock_running)) | 295 | if (unlikely(!sched_clock_running)) |
@@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
316 | */ | 340 | */ |
317 | u64 cpu_clock(int cpu) | 341 | u64 cpu_clock(int cpu) |
318 | { | 342 | { |
319 | u64 clock; | 343 | if (static_key_false(&__sched_clock_stable)) |
320 | unsigned long flags; | 344 | return sched_clock_cpu(cpu); |
321 | |||
322 | local_irq_save(flags); | ||
323 | clock = sched_clock_cpu(cpu); | ||
324 | local_irq_restore(flags); | ||
325 | 345 | ||
326 | return clock; | 346 | return sched_clock(); |
327 | } | 347 | } |
328 | 348 | ||
329 | /* | 349 | /* |
@@ -335,14 +355,10 @@ u64 cpu_clock(int cpu) | |||
335 | */ | 355 | */ |
336 | u64 local_clock(void) | 356 | u64 local_clock(void) |
337 | { | 357 | { |
338 | u64 clock; | 358 | if (static_key_false(&__sched_clock_stable)) |
339 | unsigned long flags; | 359 | return sched_clock_cpu(raw_smp_processor_id()); |
340 | 360 | ||
341 | local_irq_save(flags); | 361 | return sched_clock(); |
342 | clock = sched_clock_cpu(smp_processor_id()); | ||
343 | local_irq_restore(flags); | ||
344 | |||
345 | return clock; | ||
346 | } | 362 | } |
347 | 363 | ||
348 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 364 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
@@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu) | |||
362 | 378 | ||
363 | u64 cpu_clock(int cpu) | 379 | u64 cpu_clock(int cpu) |
364 | { | 380 | { |
365 | return sched_clock_cpu(cpu); | 381 | return sched_clock(); |
366 | } | 382 | } |
367 | 383 | ||
368 | u64 local_clock(void) | 384 | u64 local_clock(void) |
369 | { | 385 | { |
370 | return sched_clock_cpu(0); | 386 | return sched_clock(); |
371 | } | 387 | } |
372 | 388 | ||
373 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 389 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
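Note: with the static_key conversion above, cpu_clock() and local_clock() reduce to a direct sched_clock() call whenever the clock is stable; only an unstable TSC enables the key and routes callers through the filtered per-CPU path, and the irq save/restore dance is gone entirely. The sketch below illustrates that fast-path/slow-path split in plain userspace C. It is only an illustration: an ordinary boolean stands in for the kernel's self-patching static_key, and clock_gettime() stands in for sched_clock().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static bool clock_unstable;	/* mirrors the __sched_clock_stable key being enabled */

static uint64_t raw_clock_ns(void)	/* stand-in for sched_clock() */
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static uint64_t filtered_clock_ns(int cpu)	/* stand-in for sched_clock_cpu() */
{
	(void)cpu;	/* the actual per-CPU filtering is elided in this sketch */
	return raw_clock_ns();
}

static uint64_t cpu_clock_sketch(int cpu)
{
	if (clock_unstable)		/* slow path, only when the TSC is unreliable */
		return filtered_clock_ns(cpu);
	return raw_clock_ns();		/* fast path, branch patched out in the kernel */
}

int main(void)
{
	printf("stable path:   %llu ns\n", (unsigned long long)cpu_clock_sketch(0));
	clock_unstable = true;		/* roughly what clear_sched_clock_stable() effects */
	printf("unstable path: %llu ns\n", (unsigned long long)cpu_clock_sketch(0));
	return 0;
}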
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..36c951b7eef8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running; | |||
296 | */ | 296 | */ |
297 | int sysctl_sched_rt_runtime = 950000; | 297 | int sysctl_sched_rt_runtime = 950000; |
298 | 298 | ||
299 | |||
300 | |||
301 | /* | 299 | /* |
302 | * __task_rq_lock - lock the rq @p resides on. | 300 | * __task_rq_lock - lock the rq @p resides on. |
303 | */ | 301 | */ |
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p) | |||
899 | { | 897 | { |
900 | int prio; | 898 | int prio; |
901 | 899 | ||
902 | if (task_has_rt_policy(p)) | 900 | if (task_has_dl_policy(p)) |
901 | prio = MAX_DL_PRIO-1; | ||
902 | else if (task_has_rt_policy(p)) | ||
903 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 903 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
904 | else | 904 | else |
905 | prio = __normal_prio(p); | 905 | prio = __normal_prio(p); |
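Note: the effect of the new MAX_DL_PRIO branch is that -deadline tasks occupy a prio band below zero, numerically under every RT and fair priority and therefore ahead of them in importance. A hedged userspace replica of the mapping follows; MAX_DL_PRIO, MAX_RT_PRIO and the nice mapping use the kernel's values (0, 100 and nice+120), while the helper names are purely illustrative.

#include <stdio.h>

#define MAX_DL_PRIO	0
#define MAX_RT_PRIO	100
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

enum policy_kind { KIND_DL, KIND_RT, KIND_FAIR };

static int normal_prio_sketch(enum policy_kind kind, int rt_priority, int nice)
{
	if (kind == KIND_DL)
		return MAX_DL_PRIO - 1;			/* -1: numerically below all RT prios */
	if (kind == KIND_RT)
		return MAX_RT_PRIO - 1 - rt_priority;	/* 0..98 for rt_priority 1..99 */
	return NICE_TO_PRIO(nice);			/* 100..139 for nice -20..19 */
}

int main(void)
{
	printf("deadline: %d, FIFO prio 50: %d, nice 0: %d\n",
	       normal_prio_sketch(KIND_DL, 0, 0),
	       normal_prio_sketch(KIND_RT, 50, 0),
	       normal_prio_sketch(KIND_FAIR, 0, 0));
	return 0;	/* prints: deadline: -1, FIFO prio 50: 49, nice 0: 120 */
}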
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
945 | if (prev_class->switched_from) | 945 | if (prev_class->switched_from) |
946 | prev_class->switched_from(rq, p); | 946 | prev_class->switched_from(rq, p); |
947 | p->sched_class->switched_to(rq, p); | 947 | p->sched_class->switched_to(rq, p); |
948 | } else if (oldprio != p->prio) | 948 | } else if (oldprio != p->prio || dl_task(p)) |
949 | p->sched_class->prio_changed(rq, p, oldprio); | 949 | p->sched_class->prio_changed(rq, p, oldprio); |
950 | } | 950 | } |
951 | 951 | ||
@@ -1499,8 +1499,7 @@ void scheduler_ipi(void) | |||
1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | 1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send |
1500 | * this IPI. | 1500 | * this IPI. |
1501 | */ | 1501 | */ |
1502 | if (tif_need_resched()) | 1502 | preempt_fold_need_resched(); |
1503 | set_preempt_need_resched(); | ||
1504 | 1503 | ||
1505 | if (llist_empty(&this_rq()->wake_list) | 1504 | if (llist_empty(&this_rq()->wake_list) |
1506 | && !tick_nohz_full_cpu(smp_processor_id()) | 1505 | && !tick_nohz_full_cpu(smp_processor_id()) |
@@ -1717,6 +1716,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1716 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1718 | #endif | 1717 | #endif |
1719 | 1718 | ||
1719 | RB_CLEAR_NODE(&p->dl.rb_node); | ||
1720 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1721 | p->dl.dl_runtime = p->dl.runtime = 0; | ||
1722 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
1723 | p->dl.dl_period = 0; | ||
1724 | p->dl.flags = 0; | ||
1725 | |||
1720 | INIT_LIST_HEAD(&p->rt.run_list); | 1726 | INIT_LIST_HEAD(&p->rt.run_list); |
1721 | 1727 | ||
1722 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1728 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1768,7 +1774,7 @@ void set_numabalancing_state(bool enabled) | |||
1768 | /* | 1774 | /* |
1769 | * fork()/clone()-time setup: | 1775 | * fork()/clone()-time setup: |
1770 | */ | 1776 | */ |
1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) | 1777 | int sched_fork(unsigned long clone_flags, struct task_struct *p) |
1772 | { | 1778 | { |
1773 | unsigned long flags; | 1779 | unsigned long flags; |
1774 | int cpu = get_cpu(); | 1780 | int cpu = get_cpu(); |
@@ -1790,7 +1796,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1790 | * Revert to default priority/policy on fork if requested. | 1796 | * Revert to default priority/policy on fork if requested. |
1791 | */ | 1797 | */ |
1792 | if (unlikely(p->sched_reset_on_fork)) { | 1798 | if (unlikely(p->sched_reset_on_fork)) { |
1793 | if (task_has_rt_policy(p)) { | 1799 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
1794 | p->policy = SCHED_NORMAL; | 1800 | p->policy = SCHED_NORMAL; |
1795 | p->static_prio = NICE_TO_PRIO(0); | 1801 | p->static_prio = NICE_TO_PRIO(0); |
1796 | p->rt_priority = 0; | 1802 | p->rt_priority = 0; |
@@ -1807,8 +1813,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1807 | p->sched_reset_on_fork = 0; | 1813 | p->sched_reset_on_fork = 0; |
1808 | } | 1814 | } |
1809 | 1815 | ||
1810 | if (!rt_prio(p->prio)) | 1816 | if (dl_prio(p->prio)) { |
1817 | put_cpu(); | ||
1818 | return -EAGAIN; | ||
1819 | } else if (rt_prio(p->prio)) { | ||
1820 | p->sched_class = &rt_sched_class; | ||
1821 | } else { | ||
1811 | p->sched_class = &fair_sched_class; | 1822 | p->sched_class = &fair_sched_class; |
1823 | } | ||
1812 | 1824 | ||
1813 | if (p->sched_class->task_fork) | 1825 | if (p->sched_class->task_fork) |
1814 | p->sched_class->task_fork(p); | 1826 | p->sched_class->task_fork(p); |
@@ -1834,11 +1846,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1834 | init_task_preempt_count(p); | 1846 | init_task_preempt_count(p); |
1835 | #ifdef CONFIG_SMP | 1847 | #ifdef CONFIG_SMP |
1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1848 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
1849 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
1837 | #endif | 1850 | #endif |
1838 | 1851 | ||
1839 | put_cpu(); | 1852 | put_cpu(); |
1853 | return 0; | ||
1854 | } | ||
1855 | |||
1856 | unsigned long to_ratio(u64 period, u64 runtime) | ||
1857 | { | ||
1858 | if (runtime == RUNTIME_INF) | ||
1859 | return 1ULL << 20; | ||
1860 | |||
1861 | /* | ||
1862 | * Doing this here saves a lot of checks in all | ||
1863 | * the calling paths, and returning zero seems | ||
1864 | * safe for them anyway. | ||
1865 | */ | ||
1866 | if (period == 0) | ||
1867 | return 0; | ||
1868 | |||
1869 | return div64_u64(runtime << 20, period); | ||
1870 | } | ||
1871 | |||
1872 | #ifdef CONFIG_SMP | ||
1873 | inline struct dl_bw *dl_bw_of(int i) | ||
1874 | { | ||
1875 | return &cpu_rq(i)->rd->dl_bw; | ||
1840 | } | 1876 | } |
1841 | 1877 | ||
1878 | static inline int dl_bw_cpus(int i) | ||
1879 | { | ||
1880 | struct root_domain *rd = cpu_rq(i)->rd; | ||
1881 | int cpus = 0; | ||
1882 | |||
1883 | for_each_cpu_and(i, rd->span, cpu_active_mask) | ||
1884 | cpus++; | ||
1885 | |||
1886 | return cpus; | ||
1887 | } | ||
1888 | #else | ||
1889 | inline struct dl_bw *dl_bw_of(int i) | ||
1890 | { | ||
1891 | return &cpu_rq(i)->dl.dl_bw; | ||
1892 | } | ||
1893 | |||
1894 | static inline int dl_bw_cpus(int i) | ||
1895 | { | ||
1896 | return 1; | ||
1897 | } | ||
1898 | #endif | ||
1899 | |||
1900 | static inline | ||
1901 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
1902 | { | ||
1903 | dl_b->total_bw -= tsk_bw; | ||
1904 | } | ||
1905 | |||
1906 | static inline | ||
1907 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
1908 | { | ||
1909 | dl_b->total_bw += tsk_bw; | ||
1910 | } | ||
1911 | |||
1912 | static inline | ||
1913 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
1914 | { | ||
1915 | return dl_b->bw != -1 && | ||
1916 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
1917 | } | ||
1918 | |||
1919 | /* | ||
1920 | * We must be sure that accepting a new task (or allowing changing the | ||
1921 | * parameters of an existing one) is consistent with the bandwidth | ||
1922 | * constraints. If yes, this function also accordingly updates the currently | ||
1923 | * allocated bandwidth to reflect the new situation. | ||
1924 | * | ||
1925 | * This function is called while holding p's rq->lock. | ||
1926 | */ | ||
1927 | static int dl_overflow(struct task_struct *p, int policy, | ||
1928 | const struct sched_attr *attr) | ||
1929 | { | ||
1930 | |||
1931 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1932 | u64 period = attr->sched_period; | ||
1933 | u64 runtime = attr->sched_runtime; | ||
1934 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
1935 | int cpus, err = -1; | ||
1936 | |||
1937 | if (new_bw == p->dl.dl_bw) | ||
1938 | return 0; | ||
1939 | |||
1940 | /* | ||
1941 | * Either if a task, enters, leave, or stays -deadline but changes | ||
1942 | * its parameters, we may need to update accordingly the total | ||
1943 | * allocated bandwidth of the container. | ||
1944 | */ | ||
1945 | raw_spin_lock(&dl_b->lock); | ||
1946 | cpus = dl_bw_cpus(task_cpu(p)); | ||
1947 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
1948 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
1949 | __dl_add(dl_b, new_bw); | ||
1950 | err = 0; | ||
1951 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
1952 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
1953 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1954 | __dl_add(dl_b, new_bw); | ||
1955 | err = 0; | ||
1956 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
1957 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1958 | err = 0; | ||
1959 | } | ||
1960 | raw_spin_unlock(&dl_b->lock); | ||
1961 | |||
1962 | return err; | ||
1963 | } | ||
1964 | |||
1965 | extern void init_dl_bw(struct dl_bw *dl_b); | ||
1966 | |||
1842 | /* | 1967 | /* |
1843 | * wake_up_new_task - wake up a newly created task for the first time. | 1968 | * wake_up_new_task - wake up a newly created task for the first time. |
1844 | * | 1969 | * |
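Note: the admission control introduced above works entirely in fixed point. to_ratio() expresses runtime/period scaled by 2^20, and __dl_overflow() rejects a change when the summed per-task bandwidths would exceed the root domain's limit times its CPU count. A small self-contained sketch of that arithmetic follows; the constants mirror the patch, but the program itself is illustrative userspace code, not the kernel implementation (the RUNTIME_INF special case is omitted).

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20	/* bandwidths are scaled by 2^20, exactly as in to_ratio() */

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)	/* mirrors the guard in the patch */
		return 0;
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* Root-domain limit: the default 950ms of -rt/-dl time per 1s period. */
	uint64_t limit   = to_ratio(1000000000ull, 950000000ull);

	/* One task asking for 10ms of runtime every 100ms. */
	uint64_t task_bw = to_ratio(100000000ull, 10000000ull);

	uint64_t total_bw = 0;	/* bandwidth already allocated in the root domain */
	int cpus = 4;

	/* Same inequality as __dl_overflow(): reject if limit*cpus is exceeded. */
	int overflow = limit * cpus < total_bw + task_bw;

	printf("limit=%llu task_bw=%llu (out of %u) overflow=%d\n",
	       (unsigned long long)limit, (unsigned long long)task_bw,
	       1u << BW_SHIFT, overflow);
	return 0;
}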
@@ -2003,6 +2128,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2003 | if (unlikely(prev_state == TASK_DEAD)) { | 2128 | if (unlikely(prev_state == TASK_DEAD)) { |
2004 | task_numa_free(prev); | 2129 | task_numa_free(prev); |
2005 | 2130 | ||
2131 | if (prev->sched_class->task_dead) | ||
2132 | prev->sched_class->task_dead(prev); | ||
2133 | |||
2006 | /* | 2134 | /* |
2007 | * Remove function-return probe instances associated with this | 2135 | * Remove function-return probe instances associated with this |
2008 | * task and put them back on the free list. | 2136 | * task and put them back on the free list. |
@@ -2296,7 +2424,7 @@ void scheduler_tick(void) | |||
2296 | 2424 | ||
2297 | #ifdef CONFIG_SMP | 2425 | #ifdef CONFIG_SMP |
2298 | rq->idle_balance = idle_cpu(cpu); | 2426 | rq->idle_balance = idle_cpu(cpu); |
2299 | trigger_load_balance(rq, cpu); | 2427 | trigger_load_balance(rq); |
2300 | #endif | 2428 | #endif |
2301 | rq_last_tick_reset(rq); | 2429 | rq_last_tick_reset(rq); |
2302 | } | 2430 | } |
@@ -2414,10 +2542,10 @@ static inline void schedule_debug(struct task_struct *prev) | |||
2414 | { | 2542 | { |
2415 | /* | 2543 | /* |
2416 | * Test if we are atomic. Since do_exit() needs to call into | 2544 | * Test if we are atomic. Since do_exit() needs to call into |
2417 | * schedule() atomically, we ignore that path for now. | 2545 | * schedule() atomically, we ignore that path. Otherwise whine |
2418 | * Otherwise, whine if we are scheduling when we should not be. | 2546 | * if we are scheduling when we should not. |
2419 | */ | 2547 | */ |
2420 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 2548 | if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) |
2421 | __schedule_bug(prev); | 2549 | __schedule_bug(prev); |
2422 | rcu_sleep_check(); | 2550 | rcu_sleep_check(); |
2423 | 2551 | ||
@@ -2761,11 +2889,11 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
2761 | */ | 2889 | */ |
2762 | void rt_mutex_setprio(struct task_struct *p, int prio) | 2890 | void rt_mutex_setprio(struct task_struct *p, int prio) |
2763 | { | 2891 | { |
2764 | int oldprio, on_rq, running; | 2892 | int oldprio, on_rq, running, enqueue_flag = 0; |
2765 | struct rq *rq; | 2893 | struct rq *rq; |
2766 | const struct sched_class *prev_class; | 2894 | const struct sched_class *prev_class; |
2767 | 2895 | ||
2768 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 2896 | BUG_ON(prio > MAX_PRIO); |
2769 | 2897 | ||
2770 | rq = __task_rq_lock(p); | 2898 | rq = __task_rq_lock(p); |
2771 | 2899 | ||
@@ -2788,6 +2916,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2788 | } | 2916 | } |
2789 | 2917 | ||
2790 | trace_sched_pi_setprio(p, prio); | 2918 | trace_sched_pi_setprio(p, prio); |
2919 | p->pi_top_task = rt_mutex_get_top_task(p); | ||
2791 | oldprio = p->prio; | 2920 | oldprio = p->prio; |
2792 | prev_class = p->sched_class; | 2921 | prev_class = p->sched_class; |
2793 | on_rq = p->on_rq; | 2922 | on_rq = p->on_rq; |
@@ -2797,23 +2926,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2797 | if (running) | 2926 | if (running) |
2798 | p->sched_class->put_prev_task(rq, p); | 2927 | p->sched_class->put_prev_task(rq, p); |
2799 | 2928 | ||
2800 | if (rt_prio(prio)) | 2929 | /* |
2930 | * Boosting condition are: | ||
2931 | * 1. -rt task is running and holds mutex A | ||
2932 | * --> -dl task blocks on mutex A | ||
2933 | * | ||
2934 | * 2. -dl task is running and holds mutex A | ||
2935 | * --> -dl task blocks on mutex A and could preempt the | ||
2936 | * running task | ||
2937 | */ | ||
2938 | if (dl_prio(prio)) { | ||
2939 | if (!dl_prio(p->normal_prio) || (p->pi_top_task && | ||
2940 | dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { | ||
2941 | p->dl.dl_boosted = 1; | ||
2942 | p->dl.dl_throttled = 0; | ||
2943 | enqueue_flag = ENQUEUE_REPLENISH; | ||
2944 | } else | ||
2945 | p->dl.dl_boosted = 0; | ||
2946 | p->sched_class = &dl_sched_class; | ||
2947 | } else if (rt_prio(prio)) { | ||
2948 | if (dl_prio(oldprio)) | ||
2949 | p->dl.dl_boosted = 0; | ||
2950 | if (oldprio < prio) | ||
2951 | enqueue_flag = ENQUEUE_HEAD; | ||
2801 | p->sched_class = &rt_sched_class; | 2952 | p->sched_class = &rt_sched_class; |
2802 | else | 2953 | } else { |
2954 | if (dl_prio(oldprio)) | ||
2955 | p->dl.dl_boosted = 0; | ||
2803 | p->sched_class = &fair_sched_class; | 2956 | p->sched_class = &fair_sched_class; |
2957 | } | ||
2804 | 2958 | ||
2805 | p->prio = prio; | 2959 | p->prio = prio; |
2806 | 2960 | ||
2807 | if (running) | 2961 | if (running) |
2808 | p->sched_class->set_curr_task(rq); | 2962 | p->sched_class->set_curr_task(rq); |
2809 | if (on_rq) | 2963 | if (on_rq) |
2810 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 2964 | enqueue_task(rq, p, enqueue_flag); |
2811 | 2965 | ||
2812 | check_class_changed(rq, p, prev_class, oldprio); | 2966 | check_class_changed(rq, p, prev_class, oldprio); |
2813 | out_unlock: | 2967 | out_unlock: |
2814 | __task_rq_unlock(rq); | 2968 | __task_rq_unlock(rq); |
2815 | } | 2969 | } |
2816 | #endif | 2970 | #endif |
2971 | |||
2817 | void set_user_nice(struct task_struct *p, long nice) | 2972 | void set_user_nice(struct task_struct *p, long nice) |
2818 | { | 2973 | { |
2819 | int old_prio, delta, on_rq; | 2974 | int old_prio, delta, on_rq; |
@@ -2831,9 +2986,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
2831 | * The RT priorities are set via sched_setscheduler(), but we still | 2986 | * The RT priorities are set via sched_setscheduler(), but we still |
2832 | * allow the 'normal' nice value to be set - but as expected | 2987 | * allow the 'normal' nice value to be set - but as expected |
2833 | * it wont have any effect on scheduling until the task is | 2988 | * it wont have any effect on scheduling until the task is |
2834 | * SCHED_FIFO/SCHED_RR: | 2989 | * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: |
2835 | */ | 2990 | */ |
2836 | if (task_has_rt_policy(p)) { | 2991 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
2837 | p->static_prio = NICE_TO_PRIO(nice); | 2992 | p->static_prio = NICE_TO_PRIO(nice); |
2838 | goto out_unlock; | 2993 | goto out_unlock; |
2839 | } | 2994 | } |
@@ -2988,22 +3143,95 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
2988 | return pid ? find_task_by_vpid(pid) : current; | 3143 | return pid ? find_task_by_vpid(pid) : current; |
2989 | } | 3144 | } |
2990 | 3145 | ||
2991 | /* Actually do priority change: must hold rq lock. */ | 3146 | /* |
3147 | * This function initializes the sched_dl_entity of a newly becoming | ||
3148 | * SCHED_DEADLINE task. | ||
3149 | * | ||
3150 | * Only the static values are considered here, the actual runtime and the | ||
3151 | * absolute deadline will be properly calculated when the task is enqueued | ||
3152 | * for the first time with its new policy. | ||
3153 | */ | ||
2992 | static void | 3154 | static void |
2993 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 3155 | __setparam_dl(struct task_struct *p, const struct sched_attr *attr) |
3156 | { | ||
3157 | struct sched_dl_entity *dl_se = &p->dl; | ||
3158 | |||
3159 | init_dl_task_timer(dl_se); | ||
3160 | dl_se->dl_runtime = attr->sched_runtime; | ||
3161 | dl_se->dl_deadline = attr->sched_deadline; | ||
3162 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | ||
3163 | dl_se->flags = attr->sched_flags; | ||
3164 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
3165 | dl_se->dl_throttled = 0; | ||
3166 | dl_se->dl_new = 1; | ||
3167 | } | ||
3168 | |||
3169 | /* Actually do priority change: must hold pi & rq lock. */ | ||
3170 | static void __setscheduler(struct rq *rq, struct task_struct *p, | ||
3171 | const struct sched_attr *attr) | ||
2994 | { | 3172 | { |
3173 | int policy = attr->sched_policy; | ||
3174 | |||
3175 | if (policy == -1) /* setparam */ | ||
3176 | policy = p->policy; | ||
3177 | |||
2995 | p->policy = policy; | 3178 | p->policy = policy; |
2996 | p->rt_priority = prio; | 3179 | |
3180 | if (dl_policy(policy)) | ||
3181 | __setparam_dl(p, attr); | ||
3182 | else if (fair_policy(policy)) | ||
3183 | p->static_prio = NICE_TO_PRIO(attr->sched_nice); | ||
3184 | |||
3185 | /* | ||
3186 | * __sched_setscheduler() ensures attr->sched_priority == 0 when | ||
3187 | * !rt_policy. Always setting this ensures that things like | ||
3188 | * getparam()/getattr() don't report silly values for !rt tasks. | ||
3189 | */ | ||
3190 | p->rt_priority = attr->sched_priority; | ||
3191 | |||
2997 | p->normal_prio = normal_prio(p); | 3192 | p->normal_prio = normal_prio(p); |
2998 | /* we are holding p->pi_lock already */ | ||
2999 | p->prio = rt_mutex_getprio(p); | 3193 | p->prio = rt_mutex_getprio(p); |
3000 | if (rt_prio(p->prio)) | 3194 | |
3195 | if (dl_prio(p->prio)) | ||
3196 | p->sched_class = &dl_sched_class; | ||
3197 | else if (rt_prio(p->prio)) | ||
3001 | p->sched_class = &rt_sched_class; | 3198 | p->sched_class = &rt_sched_class; |
3002 | else | 3199 | else |
3003 | p->sched_class = &fair_sched_class; | 3200 | p->sched_class = &fair_sched_class; |
3201 | |||
3004 | set_load_weight(p); | 3202 | set_load_weight(p); |
3005 | } | 3203 | } |
3006 | 3204 | ||
3205 | static void | ||
3206 | __getparam_dl(struct task_struct *p, struct sched_attr *attr) | ||
3207 | { | ||
3208 | struct sched_dl_entity *dl_se = &p->dl; | ||
3209 | |||
3210 | attr->sched_priority = p->rt_priority; | ||
3211 | attr->sched_runtime = dl_se->dl_runtime; | ||
3212 | attr->sched_deadline = dl_se->dl_deadline; | ||
3213 | attr->sched_period = dl_se->dl_period; | ||
3214 | attr->sched_flags = dl_se->flags; | ||
3215 | } | ||
3216 | |||
3217 | /* | ||
3218 | * This function validates the new parameters of a -deadline task. | ||
3219 | * We ask for the deadline not being zero, and greater or equal | ||
3220 | * than the runtime, as well as the period of being zero or | ||
3221 | * greater than deadline. Furthermore, we have to be sure that | ||
3222 | * user parameters are above the internal resolution (1us); we | ||
3223 | * check sched_runtime only since it is always the smaller one. | ||
3224 | */ | ||
3225 | static bool | ||
3226 | __checkparam_dl(const struct sched_attr *attr) | ||
3227 | { | ||
3228 | return attr && attr->sched_deadline != 0 && | ||
3229 | (attr->sched_period == 0 || | ||
3230 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | ||
3231 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && | ||
3232 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | ||
3233 | } | ||
3234 | |||
3007 | /* | 3235 | /* |
3008 | * check the target process has a UID that matches the current process's | 3236 | * check the target process has a UID that matches the current process's |
3009 | */ | 3237 | */ |
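Note: __checkparam_dl() above encodes the usual runtime <= deadline <= period constraint plus a minimum runtime of 2^DL_SCALE nanoseconds. A userspace replica can be handy for validating parameters before issuing the syscall; DL_SCALE is assumed here to be 10 (about 1us), matching its definition elsewhere in this series, and the function name is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumption: matches the kernel's definition in this series */

static bool deadline_params_valid(uint64_t runtime, uint64_t deadline,
				  uint64_t period)
{
	if (deadline == 0)
		return false;
	if (period != 0 && period < deadline)		/* need deadline <= period  */
		return false;
	if (deadline < runtime)				/* need runtime  <= deadline */
		return false;
	return runtime >= (2ull << (DL_SCALE - 1));	/* above the ~1us resolution */
}

int main(void)
{
	printf("10ms/100ms/100ms valid: %d\n",
	       deadline_params_valid(10000000ull, 100000000ull, 100000000ull));
	printf("500ns runtime valid:    %d\n",
	       deadline_params_valid(500ull, 100000000ull, 100000000ull));
	return 0;	/* prints 1 then 0 */
}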
@@ -3020,10 +3248,12 @@ static bool check_same_owner(struct task_struct *p) | |||
3020 | return match; | 3248 | return match; |
3021 | } | 3249 | } |
3022 | 3250 | ||
3023 | static int __sched_setscheduler(struct task_struct *p, int policy, | 3251 | static int __sched_setscheduler(struct task_struct *p, |
3024 | const struct sched_param *param, bool user) | 3252 | const struct sched_attr *attr, |
3253 | bool user) | ||
3025 | { | 3254 | { |
3026 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3255 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
3256 | int policy = attr->sched_policy; | ||
3027 | unsigned long flags; | 3257 | unsigned long flags; |
3028 | const struct sched_class *prev_class; | 3258 | const struct sched_class *prev_class; |
3029 | struct rq *rq; | 3259 | struct rq *rq; |
@@ -3037,31 +3267,40 @@ recheck: | |||
3037 | reset_on_fork = p->sched_reset_on_fork; | 3267 | reset_on_fork = p->sched_reset_on_fork; |
3038 | policy = oldpolicy = p->policy; | 3268 | policy = oldpolicy = p->policy; |
3039 | } else { | 3269 | } else { |
3040 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); | 3270 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); |
3041 | policy &= ~SCHED_RESET_ON_FORK; | ||
3042 | 3271 | ||
3043 | if (policy != SCHED_FIFO && policy != SCHED_RR && | 3272 | if (policy != SCHED_DEADLINE && |
3273 | policy != SCHED_FIFO && policy != SCHED_RR && | ||
3044 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 3274 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
3045 | policy != SCHED_IDLE) | 3275 | policy != SCHED_IDLE) |
3046 | return -EINVAL; | 3276 | return -EINVAL; |
3047 | } | 3277 | } |
3048 | 3278 | ||
3279 | if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) | ||
3280 | return -EINVAL; | ||
3281 | |||
3049 | /* | 3282 | /* |
3050 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 3283 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
3051 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 3284 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
3052 | * SCHED_BATCH and SCHED_IDLE is 0. | 3285 | * SCHED_BATCH and SCHED_IDLE is 0. |
3053 | */ | 3286 | */ |
3054 | if (param->sched_priority < 0 || | 3287 | if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || |
3055 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3288 | (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) |
3056 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | ||
3057 | return -EINVAL; | 3289 | return -EINVAL; |
3058 | if (rt_policy(policy) != (param->sched_priority != 0)) | 3290 | if ((dl_policy(policy) && !__checkparam_dl(attr)) || |
3291 | (rt_policy(policy) != (attr->sched_priority != 0))) | ||
3059 | return -EINVAL; | 3292 | return -EINVAL; |
3060 | 3293 | ||
3061 | /* | 3294 | /* |
3062 | * Allow unprivileged RT tasks to decrease priority: | 3295 | * Allow unprivileged RT tasks to decrease priority: |
3063 | */ | 3296 | */ |
3064 | if (user && !capable(CAP_SYS_NICE)) { | 3297 | if (user && !capable(CAP_SYS_NICE)) { |
3298 | if (fair_policy(policy)) { | ||
3299 | if (attr->sched_nice < TASK_NICE(p) && | ||
3300 | !can_nice(p, attr->sched_nice)) | ||
3301 | return -EPERM; | ||
3302 | } | ||
3303 | |||
3065 | if (rt_policy(policy)) { | 3304 | if (rt_policy(policy)) { |
3066 | unsigned long rlim_rtprio = | 3305 | unsigned long rlim_rtprio = |
3067 | task_rlimit(p, RLIMIT_RTPRIO); | 3306 | task_rlimit(p, RLIMIT_RTPRIO); |
@@ -3071,8 +3310,8 @@ recheck: | |||
3071 | return -EPERM; | 3310 | return -EPERM; |
3072 | 3311 | ||
3073 | /* can't increase priority */ | 3312 | /* can't increase priority */ |
3074 | if (param->sched_priority > p->rt_priority && | 3313 | if (attr->sched_priority > p->rt_priority && |
3075 | param->sched_priority > rlim_rtprio) | 3314 | attr->sched_priority > rlim_rtprio) |
3076 | return -EPERM; | 3315 | return -EPERM; |
3077 | } | 3316 | } |
3078 | 3317 | ||
@@ -3120,14 +3359,21 @@ recheck: | |||
3120 | /* | 3359 | /* |
3121 | * If not changing anything there's no need to proceed further: | 3360 | * If not changing anything there's no need to proceed further: |
3122 | */ | 3361 | */ |
3123 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 3362 | if (unlikely(policy == p->policy)) { |
3124 | param->sched_priority == p->rt_priority))) { | 3363 | if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) |
3364 | goto change; | ||
3365 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | ||
3366 | goto change; | ||
3367 | if (dl_policy(policy)) | ||
3368 | goto change; | ||
3369 | |||
3125 | task_rq_unlock(rq, p, &flags); | 3370 | task_rq_unlock(rq, p, &flags); |
3126 | return 0; | 3371 | return 0; |
3127 | } | 3372 | } |
3373 | change: | ||
3128 | 3374 | ||
3129 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3130 | if (user) { | 3375 | if (user) { |
3376 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3131 | /* | 3377 | /* |
3132 | * Do not allow realtime tasks into groups that have no runtime | 3378 | * Do not allow realtime tasks into groups that have no runtime |
3133 | * assigned. | 3379 | * assigned. |
@@ -3138,8 +3384,24 @@ recheck: | |||
3138 | task_rq_unlock(rq, p, &flags); | 3384 | task_rq_unlock(rq, p, &flags); |
3139 | return -EPERM; | 3385 | return -EPERM; |
3140 | } | 3386 | } |
3141 | } | ||
3142 | #endif | 3387 | #endif |
3388 | #ifdef CONFIG_SMP | ||
3389 | if (dl_bandwidth_enabled() && dl_policy(policy)) { | ||
3390 | cpumask_t *span = rq->rd->span; | ||
3391 | |||
3392 | /* | ||
3393 | * Don't allow tasks with an affinity mask smaller than | ||
3394 | * the entire root_domain to become SCHED_DEADLINE. We | ||
3395 | * will also fail if there's no bandwidth available. | ||
3396 | */ | ||
3397 | if (!cpumask_subset(span, &p->cpus_allowed) || | ||
3398 | rq->rd->dl_bw.bw == 0) { | ||
3399 | task_rq_unlock(rq, p, &flags); | ||
3400 | return -EPERM; | ||
3401 | } | ||
3402 | } | ||
3403 | #endif | ||
3404 | } | ||
3143 | 3405 | ||
3144 | /* recheck policy now with rq lock held */ | 3406 | /* recheck policy now with rq lock held */ |
3145 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 3407 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
@@ -3147,6 +3409,17 @@ recheck: | |||
3147 | task_rq_unlock(rq, p, &flags); | 3409 | task_rq_unlock(rq, p, &flags); |
3148 | goto recheck; | 3410 | goto recheck; |
3149 | } | 3411 | } |
3412 | |||
3413 | /* | ||
3414 | * If setscheduling to SCHED_DEADLINE (or changing the parameters | ||
3415 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth | ||
3416 | * is available. | ||
3417 | */ | ||
3418 | if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { | ||
3419 | task_rq_unlock(rq, p, &flags); | ||
3420 | return -EBUSY; | ||
3421 | } | ||
3422 | |||
3150 | on_rq = p->on_rq; | 3423 | on_rq = p->on_rq; |
3151 | running = task_current(rq, p); | 3424 | running = task_current(rq, p); |
3152 | if (on_rq) | 3425 | if (on_rq) |
@@ -3158,7 +3431,7 @@ recheck: | |||
3158 | 3431 | ||
3159 | oldprio = p->prio; | 3432 | oldprio = p->prio; |
3160 | prev_class = p->sched_class; | 3433 | prev_class = p->sched_class; |
3161 | __setscheduler(rq, p, policy, param->sched_priority); | 3434 | __setscheduler(rq, p, attr); |
3162 | 3435 | ||
3163 | if (running) | 3436 | if (running) |
3164 | p->sched_class->set_curr_task(rq); | 3437 | p->sched_class->set_curr_task(rq); |
@@ -3173,6 +3446,26 @@ recheck: | |||
3173 | return 0; | 3446 | return 0; |
3174 | } | 3447 | } |
3175 | 3448 | ||
3449 | static int _sched_setscheduler(struct task_struct *p, int policy, | ||
3450 | const struct sched_param *param, bool check) | ||
3451 | { | ||
3452 | struct sched_attr attr = { | ||
3453 | .sched_policy = policy, | ||
3454 | .sched_priority = param->sched_priority, | ||
3455 | .sched_nice = PRIO_TO_NICE(p->static_prio), | ||
3456 | }; | ||
3457 | |||
3458 | /* | ||
3459 | * Fixup the legacy SCHED_RESET_ON_FORK hack | ||
3460 | */ | ||
3461 | if (policy & SCHED_RESET_ON_FORK) { | ||
3462 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
3463 | policy &= ~SCHED_RESET_ON_FORK; | ||
3464 | attr.sched_policy = policy; | ||
3465 | } | ||
3466 | |||
3467 | return __sched_setscheduler(p, &attr, check); | ||
3468 | } | ||
3176 | /** | 3469 | /** |
3177 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 3470 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
3178 | * @p: the task in question. | 3471 | * @p: the task in question. |
@@ -3186,10 +3479,16 @@ recheck: | |||
3186 | int sched_setscheduler(struct task_struct *p, int policy, | 3479 | int sched_setscheduler(struct task_struct *p, int policy, |
3187 | const struct sched_param *param) | 3480 | const struct sched_param *param) |
3188 | { | 3481 | { |
3189 | return __sched_setscheduler(p, policy, param, true); | 3482 | return _sched_setscheduler(p, policy, param, true); |
3190 | } | 3483 | } |
3191 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3484 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
3192 | 3485 | ||
3486 | int sched_setattr(struct task_struct *p, const struct sched_attr *attr) | ||
3487 | { | ||
3488 | return __sched_setscheduler(p, attr, true); | ||
3489 | } | ||
3490 | EXPORT_SYMBOL_GPL(sched_setattr); | ||
3491 | |||
3193 | /** | 3492 | /** |
3194 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | 3493 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
3195 | * @p: the task in question. | 3494 | * @p: the task in question. |
@@ -3206,7 +3505,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
3206 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3505 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
3207 | const struct sched_param *param) | 3506 | const struct sched_param *param) |
3208 | { | 3507 | { |
3209 | return __sched_setscheduler(p, policy, param, false); | 3508 | return _sched_setscheduler(p, policy, param, false); |
3210 | } | 3509 | } |
3211 | 3510 | ||
3212 | static int | 3511 | static int |
@@ -3231,6 +3530,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3231 | return retval; | 3530 | return retval; |
3232 | } | 3531 | } |
3233 | 3532 | ||
3533 | /* | ||
3534 | * Mimics kernel/events/core.c perf_copy_attr(). | ||
3535 | */ | ||
3536 | static int sched_copy_attr(struct sched_attr __user *uattr, | ||
3537 | struct sched_attr *attr) | ||
3538 | { | ||
3539 | u32 size; | ||
3540 | int ret; | ||
3541 | |||
3542 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | ||
3543 | return -EFAULT; | ||
3544 | |||
3545 | /* | ||
3546 | * zero the full structure, so that a short copy will be nice. | ||
3547 | */ | ||
3548 | memset(attr, 0, sizeof(*attr)); | ||
3549 | |||
3550 | ret = get_user(size, &uattr->size); | ||
3551 | if (ret) | ||
3552 | return ret; | ||
3553 | |||
3554 | if (size > PAGE_SIZE) /* silly large */ | ||
3555 | goto err_size; | ||
3556 | |||
3557 | if (!size) /* abi compat */ | ||
3558 | size = SCHED_ATTR_SIZE_VER0; | ||
3559 | |||
3560 | if (size < SCHED_ATTR_SIZE_VER0) | ||
3561 | goto err_size; | ||
3562 | |||
3563 | /* | ||
3564 | * If we're handed a bigger struct than we know of, | ||
3565 | * ensure all the unknown bits are 0 - i.e. new | ||
3566 | * user-space does not rely on any kernel feature | ||
3567 | * extensions we dont know about yet. | ||
3568 | */ | ||
3569 | if (size > sizeof(*attr)) { | ||
3570 | unsigned char __user *addr; | ||
3571 | unsigned char __user *end; | ||
3572 | unsigned char val; | ||
3573 | |||
3574 | addr = (void __user *)uattr + sizeof(*attr); | ||
3575 | end = (void __user *)uattr + size; | ||
3576 | |||
3577 | for (; addr < end; addr++) { | ||
3578 | ret = get_user(val, addr); | ||
3579 | if (ret) | ||
3580 | return ret; | ||
3581 | if (val) | ||
3582 | goto err_size; | ||
3583 | } | ||
3584 | size = sizeof(*attr); | ||
3585 | } | ||
3586 | |||
3587 | ret = copy_from_user(attr, uattr, size); | ||
3588 | if (ret) | ||
3589 | return -EFAULT; | ||
3590 | |||
3591 | /* | ||
3592 | * XXX: do we want to be lenient like existing syscalls; or do we want | ||
3593 | * to be strict and return an error on out-of-bounds values? | ||
3594 | */ | ||
3595 | attr->sched_nice = clamp(attr->sched_nice, -20, 19); | ||
3596 | |||
3597 | out: | ||
3598 | return ret; | ||
3599 | |||
3600 | err_size: | ||
3601 | put_user(sizeof(*attr), &uattr->size); | ||
3602 | ret = -E2BIG; | ||
3603 | goto out; | ||
3604 | } | ||
3605 | |||
3234 | /** | 3606 | /** |
3235 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 3607 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
3236 | * @pid: the pid in question. | 3608 | * @pid: the pid in question. |
@@ -3262,6 +3634,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
3262 | } | 3634 | } |
3263 | 3635 | ||
3264 | /** | 3636 | /** |
3637 | * sys_sched_setattr - same as above, but with extended sched_attr | ||
3638 | * @pid: the pid in question. | ||
3639 | * @uattr: structure containing the extended parameters. | ||
3640 | */ | ||
3641 | SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) | ||
3642 | { | ||
3643 | struct sched_attr attr; | ||
3644 | struct task_struct *p; | ||
3645 | int retval; | ||
3646 | |||
3647 | if (!uattr || pid < 0) | ||
3648 | return -EINVAL; | ||
3649 | |||
3650 | if (sched_copy_attr(uattr, &attr)) | ||
3651 | return -EFAULT; | ||
3652 | |||
3653 | rcu_read_lock(); | ||
3654 | retval = -ESRCH; | ||
3655 | p = find_process_by_pid(pid); | ||
3656 | if (p != NULL) | ||
3657 | retval = sched_setattr(p, &attr); | ||
3658 | rcu_read_unlock(); | ||
3659 | |||
3660 | return retval; | ||
3661 | } | ||
3662 | |||
3663 | /** | ||
3265 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3664 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
3266 | * @pid: the pid in question. | 3665 | * @pid: the pid in question. |
3267 | * | 3666 | * |
@@ -3316,6 +3715,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
3316 | if (retval) | 3715 | if (retval) |
3317 | goto out_unlock; | 3716 | goto out_unlock; |
3318 | 3717 | ||
3718 | if (task_has_dl_policy(p)) { | ||
3719 | retval = -EINVAL; | ||
3720 | goto out_unlock; | ||
3721 | } | ||
3319 | lp.sched_priority = p->rt_priority; | 3722 | lp.sched_priority = p->rt_priority; |
3320 | rcu_read_unlock(); | 3723 | rcu_read_unlock(); |
3321 | 3724 | ||
@@ -3331,6 +3734,96 @@ out_unlock: | |||
3331 | return retval; | 3734 | return retval; |
3332 | } | 3735 | } |
3333 | 3736 | ||
3737 | static int sched_read_attr(struct sched_attr __user *uattr, | ||
3738 | struct sched_attr *attr, | ||
3739 | unsigned int usize) | ||
3740 | { | ||
3741 | int ret; | ||
3742 | |||
3743 | if (!access_ok(VERIFY_WRITE, uattr, usize)) | ||
3744 | return -EFAULT; | ||
3745 | |||
3746 | /* | ||
3747 | * If we're handed a smaller struct than we know of, | ||
3748 | * ensure all the unknown bits are 0 - i.e. old | ||
3749 | * user-space does not get uncomplete information. | ||
3750 | */ | ||
3751 | if (usize < sizeof(*attr)) { | ||
3752 | unsigned char *addr; | ||
3753 | unsigned char *end; | ||
3754 | |||
3755 | addr = (void *)attr + usize; | ||
3756 | end = (void *)attr + sizeof(*attr); | ||
3757 | |||
3758 | for (; addr < end; addr++) { | ||
3759 | if (*addr) | ||
3760 | goto err_size; | ||
3761 | } | ||
3762 | |||
3763 | attr->size = usize; | ||
3764 | } | ||
3765 | |||
3766 | ret = copy_to_user(uattr, attr, usize); | ||
3767 | if (ret) | ||
3768 | return -EFAULT; | ||
3769 | |||
3770 | out: | ||
3771 | return ret; | ||
3772 | |||
3773 | err_size: | ||
3774 | ret = -E2BIG; | ||
3775 | goto out; | ||
3776 | } | ||
3777 | |||
3778 | /** | ||
3779 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr | ||
3780 | * @pid: the pid in question. | ||
3781 | * @uattr: structure containing the extended parameters. | ||
3782 | * @size: sizeof(attr) for fwd/bwd comp. | ||
3783 | */ | ||
3784 | SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | ||
3785 | unsigned int, size) | ||
3786 | { | ||
3787 | struct sched_attr attr = { | ||
3788 | .size = sizeof(struct sched_attr), | ||
3789 | }; | ||
3790 | struct task_struct *p; | ||
3791 | int retval; | ||
3792 | |||
3793 | if (!uattr || pid < 0 || size > PAGE_SIZE || | ||
3794 | size < SCHED_ATTR_SIZE_VER0) | ||
3795 | return -EINVAL; | ||
3796 | |||
3797 | rcu_read_lock(); | ||
3798 | p = find_process_by_pid(pid); | ||
3799 | retval = -ESRCH; | ||
3800 | if (!p) | ||
3801 | goto out_unlock; | ||
3802 | |||
3803 | retval = security_task_getscheduler(p); | ||
3804 | if (retval) | ||
3805 | goto out_unlock; | ||
3806 | |||
3807 | attr.sched_policy = p->policy; | ||
3808 | if (p->sched_reset_on_fork) | ||
3809 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
3810 | if (task_has_dl_policy(p)) | ||
3811 | __getparam_dl(p, &attr); | ||
3812 | else if (task_has_rt_policy(p)) | ||
3813 | attr.sched_priority = p->rt_priority; | ||
3814 | else | ||
3815 | attr.sched_nice = TASK_NICE(p); | ||
3816 | |||
3817 | rcu_read_unlock(); | ||
3818 | |||
3819 | retval = sched_read_attr(uattr, &attr, size); | ||
3820 | return retval; | ||
3821 | |||
3822 | out_unlock: | ||
3823 | rcu_read_unlock(); | ||
3824 | return retval; | ||
3825 | } | ||
3826 | |||
3334 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | 3827 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) |
3335 | { | 3828 | { |
3336 | cpumask_var_t cpus_allowed, new_mask; | 3829 | cpumask_var_t cpus_allowed, new_mask; |
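Note: since C libraries carry no wrappers for the new system calls yet, userspace reaches them through syscall(2) with a struct sched_attr laid out as in the uapi header added by this series. The sketch below switches the calling thread to SCHED_DEADLINE with a 10ms budget every 100ms and reads the parameters back. The syscall numbers 314/315 are the x86-64 assignments and are an assumption for other architectures; the argument counts follow the SYSCALL_DEFINEs shown in this diff (later ABI revisions may add a flags argument).

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314	/* x86-64; assumption for other architectures */
#endif
#ifndef __NR_sched_getattr
#define __NR_sched_getattr 315	/* x86-64; assumption for other architectures */
#endif
#define SCHED_DEADLINE 6

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE parameters, in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/* 10ms of runtime ...  */
	attr.sched_deadline = 100 * 1000 * 1000;	/* ... within 100ms ... */
	attr.sched_period   = 100 * 1000 * 1000;	/* ... every 100ms      */

	/* pid 0 means "current"; expect EBUSY if no bandwidth is left in the
	 * root domain, EPERM without privilege, EINVAL if __checkparam_dl()
	 * rejects the parameters. */
	if (syscall(__NR_sched_setattr, 0, &attr) != 0) {
		perror("sched_setattr");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr)) == 0)
		printf("policy=%u runtime=%llu deadline=%llu period=%llu\n",
		       attr.sched_policy,
		       (unsigned long long)attr.sched_runtime,
		       (unsigned long long)attr.sched_deadline,
		       (unsigned long long)attr.sched_period);
	return 0;
}

Running this without privilege, or with less free bandwidth than requested, is expected to fail, matching the admission checks in __sched_setscheduler() above.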
@@ -3375,8 +3868,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3375 | if (retval) | 3868 | if (retval) |
3376 | goto out_unlock; | 3869 | goto out_unlock; |
3377 | 3870 | ||
3871 | |||
3378 | cpuset_cpus_allowed(p, cpus_allowed); | 3872 | cpuset_cpus_allowed(p, cpus_allowed); |
3379 | cpumask_and(new_mask, in_mask, cpus_allowed); | 3873 | cpumask_and(new_mask, in_mask, cpus_allowed); |
3874 | |||
3875 | /* | ||
3876 | * Since bandwidth control happens on root_domain basis, | ||
3877 | * if admission test is enabled, we only admit -deadline | ||
3878 | * tasks allowed to run on all the CPUs in the task's | ||
3879 | * root_domain. | ||
3880 | */ | ||
3881 | #ifdef CONFIG_SMP | ||
3882 | if (task_has_dl_policy(p)) { | ||
3883 | const struct cpumask *span = task_rq(p)->rd->span; | ||
3884 | |||
3885 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
3886 | retval = -EBUSY; | ||
3887 | goto out_unlock; | ||
3888 | } | ||
3889 | } | ||
3890 | #endif | ||
3380 | again: | 3891 | again: |
3381 | retval = set_cpus_allowed_ptr(p, new_mask); | 3892 | retval = set_cpus_allowed_ptr(p, new_mask); |
3382 | 3893 | ||
@@ -3653,7 +4164,7 @@ again: | |||
3653 | } | 4164 | } |
3654 | 4165 | ||
3655 | double_rq_lock(rq, p_rq); | 4166 | double_rq_lock(rq, p_rq); |
3656 | while (task_rq(p) != p_rq) { | 4167 | if (task_rq(p) != p_rq) { |
3657 | double_rq_unlock(rq, p_rq); | 4168 | double_rq_unlock(rq, p_rq); |
3658 | goto again; | 4169 | goto again; |
3659 | } | 4170 | } |
@@ -3742,6 +4253,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
3742 | case SCHED_RR: | 4253 | case SCHED_RR: |
3743 | ret = MAX_USER_RT_PRIO-1; | 4254 | ret = MAX_USER_RT_PRIO-1; |
3744 | break; | 4255 | break; |
4256 | case SCHED_DEADLINE: | ||
3745 | case SCHED_NORMAL: | 4257 | case SCHED_NORMAL: |
3746 | case SCHED_BATCH: | 4258 | case SCHED_BATCH: |
3747 | case SCHED_IDLE: | 4259 | case SCHED_IDLE: |
@@ -3768,6 +4280,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
3768 | case SCHED_RR: | 4280 | case SCHED_RR: |
3769 | ret = 1; | 4281 | ret = 1; |
3770 | break; | 4282 | break; |
4283 | case SCHED_DEADLINE: | ||
3771 | case SCHED_NORMAL: | 4284 | case SCHED_NORMAL: |
3772 | case SCHED_BATCH: | 4285 | case SCHED_BATCH: |
3773 | case SCHED_IDLE: | 4286 | case SCHED_IDLE: |
@@ -4514,13 +5027,31 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
4514 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5027 | static int sched_cpu_inactive(struct notifier_block *nfb, |
4515 | unsigned long action, void *hcpu) | 5028 | unsigned long action, void *hcpu) |
4516 | { | 5029 | { |
5030 | unsigned long flags; | ||
5031 | long cpu = (long)hcpu; | ||
5032 | |||
4517 | switch (action & ~CPU_TASKS_FROZEN) { | 5033 | switch (action & ~CPU_TASKS_FROZEN) { |
4518 | case CPU_DOWN_PREPARE: | 5034 | case CPU_DOWN_PREPARE: |
4519 | set_cpu_active((long)hcpu, false); | 5035 | set_cpu_active(cpu, false); |
5036 | |||
5037 | /* explicitly allow suspend */ | ||
5038 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5039 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
5040 | bool overflow; | ||
5041 | int cpus; | ||
5042 | |||
5043 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5044 | cpus = dl_bw_cpus(cpu); | ||
5045 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5046 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5047 | |||
5048 | if (overflow) | ||
5049 | return notifier_from_errno(-EBUSY); | ||
5050 | } | ||
4520 | return NOTIFY_OK; | 5051 | return NOTIFY_OK; |
4521 | default: | ||
4522 | return NOTIFY_DONE; | ||
4523 | } | 5052 | } |
5053 | |||
5054 | return NOTIFY_DONE; | ||
4524 | } | 5055 | } |
4525 | 5056 | ||
4526 | static int __init migration_init(void) | 5057 | static int __init migration_init(void) |
@@ -4739,6 +5270,8 @@ static void free_rootdomain(struct rcu_head *rcu) | |||
4739 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | 5270 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
4740 | 5271 | ||
4741 | cpupri_cleanup(&rd->cpupri); | 5272 | cpupri_cleanup(&rd->cpupri); |
5273 | cpudl_cleanup(&rd->cpudl); | ||
5274 | free_cpumask_var(rd->dlo_mask); | ||
4742 | free_cpumask_var(rd->rto_mask); | 5275 | free_cpumask_var(rd->rto_mask); |
4743 | free_cpumask_var(rd->online); | 5276 | free_cpumask_var(rd->online); |
4744 | free_cpumask_var(rd->span); | 5277 | free_cpumask_var(rd->span); |
@@ -4790,8 +5323,14 @@ static int init_rootdomain(struct root_domain *rd) | |||
4790 | goto out; | 5323 | goto out; |
4791 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 5324 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
4792 | goto free_span; | 5325 | goto free_span; |
4793 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 5326 | if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) |
4794 | goto free_online; | 5327 | goto free_online; |
5328 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5329 | goto free_dlo_mask; | ||
5330 | |||
5331 | init_dl_bw(&rd->dl_bw); | ||
5332 | if (cpudl_init(&rd->cpudl) != 0) | ||
5333 | goto free_dlo_mask; | ||
4795 | 5334 | ||
4796 | if (cpupri_init(&rd->cpupri) != 0) | 5335 | if (cpupri_init(&rd->cpupri) != 0) |
4797 | goto free_rto_mask; | 5336 | goto free_rto_mask; |
@@ -4799,6 +5338,8 @@ static int init_rootdomain(struct root_domain *rd) | |||
4799 | 5338 | ||
4800 | free_rto_mask: | 5339 | free_rto_mask: |
4801 | free_cpumask_var(rd->rto_mask); | 5340 | free_cpumask_var(rd->rto_mask); |
5341 | free_dlo_mask: | ||
5342 | free_cpumask_var(rd->dlo_mask); | ||
4802 | free_online: | 5343 | free_online: |
4803 | free_cpumask_var(rd->online); | 5344 | free_cpumask_var(rd->online); |
4804 | free_span: | 5345 | free_span: |
@@ -6150,6 +6691,7 @@ void __init sched_init_smp(void) | |||
6150 | free_cpumask_var(non_isolated_cpus); | 6691 | free_cpumask_var(non_isolated_cpus); |
6151 | 6692 | ||
6152 | init_sched_rt_class(); | 6693 | init_sched_rt_class(); |
6694 | init_sched_dl_class(); | ||
6153 | } | 6695 | } |
6154 | #else | 6696 | #else |
6155 | void __init sched_init_smp(void) | 6697 | void __init sched_init_smp(void) |
@@ -6219,13 +6761,15 @@ void __init sched_init(void) | |||
6219 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6761 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
6220 | } | 6762 | } |
6221 | 6763 | ||
6764 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6765 | global_rt_period(), global_rt_runtime()); | ||
6766 | init_dl_bandwidth(&def_dl_bandwidth, | ||
6767 | global_rt_period(), global_rt_runtime()); | ||
6768 | |||
6222 | #ifdef CONFIG_SMP | 6769 | #ifdef CONFIG_SMP |
6223 | init_defrootdomain(); | 6770 | init_defrootdomain(); |
6224 | #endif | 6771 | #endif |
6225 | 6772 | ||
6226 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6227 | global_rt_period(), global_rt_runtime()); | ||
6228 | |||
6229 | #ifdef CONFIG_RT_GROUP_SCHED | 6773 | #ifdef CONFIG_RT_GROUP_SCHED |
6230 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 6774 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
6231 | global_rt_period(), global_rt_runtime()); | 6775 | global_rt_period(), global_rt_runtime()); |
@@ -6249,6 +6793,7 @@ void __init sched_init(void) | |||
6249 | rq->calc_load_update = jiffies + LOAD_FREQ; | 6793 | rq->calc_load_update = jiffies + LOAD_FREQ; |
6250 | init_cfs_rq(&rq->cfs); | 6794 | init_cfs_rq(&rq->cfs); |
6251 | init_rt_rq(&rq->rt, rq); | 6795 | init_rt_rq(&rq->rt, rq); |
6796 | init_dl_rq(&rq->dl, rq); | ||
6252 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6797 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6253 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 6798 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
6254 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6799 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -6320,10 +6865,6 @@ void __init sched_init(void) | |||
6320 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6865 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
6321 | #endif | 6866 | #endif |
6322 | 6867 | ||
6323 | #ifdef CONFIG_RT_MUTEXES | ||
6324 | plist_head_init(&init_task.pi_waiters); | ||
6325 | #endif | ||
6326 | |||
6327 | /* | 6868 | /* |
6328 | * The boot idle thread does lazy MMU switching as well: | 6869 | * The boot idle thread does lazy MMU switching as well: |
6329 | */ | 6870 | */ |
@@ -6397,13 +6938,16 @@ EXPORT_SYMBOL(__might_sleep); | |||
6397 | static void normalize_task(struct rq *rq, struct task_struct *p) | 6938 | static void normalize_task(struct rq *rq, struct task_struct *p) |
6398 | { | 6939 | { |
6399 | const struct sched_class *prev_class = p->sched_class; | 6940 | const struct sched_class *prev_class = p->sched_class; |
6941 | struct sched_attr attr = { | ||
6942 | .sched_policy = SCHED_NORMAL, | ||
6943 | }; | ||
6400 | int old_prio = p->prio; | 6944 | int old_prio = p->prio; |
6401 | int on_rq; | 6945 | int on_rq; |
6402 | 6946 | ||
6403 | on_rq = p->on_rq; | 6947 | on_rq = p->on_rq; |
6404 | if (on_rq) | 6948 | if (on_rq) |
6405 | dequeue_task(rq, p, 0); | 6949 | dequeue_task(rq, p, 0); |
6406 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 6950 | __setscheduler(rq, p, &attr); |
6407 | if (on_rq) { | 6951 | if (on_rq) { |
6408 | enqueue_task(rq, p, 0); | 6952 | enqueue_task(rq, p, 0); |
6409 | resched_task(rq->curr); | 6953 | resched_task(rq->curr); |
@@ -6433,7 +6977,7 @@ void normalize_rt_tasks(void) | |||
6433 | p->se.statistics.block_start = 0; | 6977 | p->se.statistics.block_start = 0; |
6434 | #endif | 6978 | #endif |
6435 | 6979 | ||
6436 | if (!rt_task(p)) { | 6980 | if (!dl_task(p) && !rt_task(p)) { |
6437 | /* | 6981 | /* |
6438 | * Renice negative nice level userspace | 6982 | * Renice negative nice level userspace |
6439 | * tasks back to 0: | 6983 | * tasks back to 0: |
@@ -6628,16 +7172,6 @@ void sched_move_task(struct task_struct *tsk) | |||
6628 | } | 7172 | } |
6629 | #endif /* CONFIG_CGROUP_SCHED */ | 7173 | #endif /* CONFIG_CGROUP_SCHED */ |
6630 | 7174 | ||
6631 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | ||
6632 | static unsigned long to_ratio(u64 period, u64 runtime) | ||
6633 | { | ||
6634 | if (runtime == RUNTIME_INF) | ||
6635 | return 1ULL << 20; | ||
6636 | |||
6637 | return div64_u64(runtime << 20, period); | ||
6638 | } | ||
6639 | #endif | ||
6640 | |||
6641 | #ifdef CONFIG_RT_GROUP_SCHED | 7175 | #ifdef CONFIG_RT_GROUP_SCHED |
6642 | /* | 7176 | /* |
6643 | * Ensure that the real time constraints are schedulable. | 7177 | * Ensure that the real time constraints are schedulable. |
@@ -6811,24 +7345,13 @@ static long sched_group_rt_period(struct task_group *tg) | |||
6811 | do_div(rt_period_us, NSEC_PER_USEC); | 7345 | do_div(rt_period_us, NSEC_PER_USEC); |
6812 | return rt_period_us; | 7346 | return rt_period_us; |
6813 | } | 7347 | } |
7348 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
6814 | 7349 | ||
7350 | #ifdef CONFIG_RT_GROUP_SCHED | ||
6815 | static int sched_rt_global_constraints(void) | 7351 | static int sched_rt_global_constraints(void) |
6816 | { | 7352 | { |
6817 | u64 runtime, period; | ||
6818 | int ret = 0; | 7353 | int ret = 0; |
6819 | 7354 | ||
6820 | if (sysctl_sched_rt_period <= 0) | ||
6821 | return -EINVAL; | ||
6822 | |||
6823 | runtime = global_rt_runtime(); | ||
6824 | period = global_rt_period(); | ||
6825 | |||
6826 | /* | ||
6827 | * Sanity check on the sysctl variables. | ||
6828 | */ | ||
6829 | if (runtime > period && runtime != RUNTIME_INF) | ||
6830 | return -EINVAL; | ||
6831 | |||
6832 | mutex_lock(&rt_constraints_mutex); | 7355 | mutex_lock(&rt_constraints_mutex); |
6833 | read_lock(&tasklist_lock); | 7356 | read_lock(&tasklist_lock); |
6834 | ret = __rt_schedulable(NULL, 0, 0); | 7357 | ret = __rt_schedulable(NULL, 0, 0); |
@@ -6851,17 +7374,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | |||
6851 | static int sched_rt_global_constraints(void) | 7374 | static int sched_rt_global_constraints(void) |
6852 | { | 7375 | { |
6853 | unsigned long flags; | 7376 | unsigned long flags; |
6854 | int i; | 7377 | int i, ret = 0; |
6855 | |||
6856 | if (sysctl_sched_rt_period <= 0) | ||
6857 | return -EINVAL; | ||
6858 | |||
6859 | /* | ||
6860 | * There's always some RT tasks in the root group | ||
6861 | * -- migration, kstopmachine etc.. | ||
6862 | */ | ||
6863 | if (sysctl_sched_rt_runtime == 0) | ||
6864 | return -EBUSY; | ||
6865 | 7378 | ||
6866 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 7379 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
6867 | for_each_possible_cpu(i) { | 7380 | for_each_possible_cpu(i) { |
@@ -6873,36 +7386,88 @@ static int sched_rt_global_constraints(void) | |||
6873 | } | 7386 | } |
6874 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 7387 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
6875 | 7388 | ||
6876 | return 0; | 7389 | return ret; |
6877 | } | 7390 | } |
6878 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7391 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6879 | 7392 | ||
6880 | int sched_rr_handler(struct ctl_table *table, int write, | 7393 | static int sched_dl_global_constraints(void) |
6881 | void __user *buffer, size_t *lenp, | ||
6882 | loff_t *ppos) | ||
6883 | { | 7394 | { |
6884 | int ret; | 7395 | u64 runtime = global_rt_runtime(); |
6885 | static DEFINE_MUTEX(mutex); | 7396 | u64 period = global_rt_period(); |
7397 | u64 new_bw = to_ratio(period, runtime); | ||
7398 | int cpu, ret = 0; | ||
6886 | 7399 | ||
6887 | mutex_lock(&mutex); | 7400 | /* |
6888 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7401 | * Here we want to check the bandwidth not being set to some |
6889 | /* make sure that internally we keep jiffies */ | 7402 | * value smaller than the currently allocated bandwidth in |
6890 | /* also, writing zero resets timeslice to default */ | 7403 | * any of the root_domains. |
6891 | if (!ret && write) { | 7404 | * |
6892 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 7405 | * FIXME: Cycling on all the CPUs is overdoing, but simpler than |
6893 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 7406 | * cycling on root_domains... Discussion on different/better |
7407 | * solutions is welcome! | ||
7408 | */ | ||
7409 | for_each_possible_cpu(cpu) { | ||
7410 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
7411 | |||
7412 | raw_spin_lock(&dl_b->lock); | ||
7413 | if (new_bw < dl_b->total_bw) | ||
7414 | ret = -EBUSY; | ||
7415 | raw_spin_unlock(&dl_b->lock); | ||
7416 | |||
7417 | if (ret) | ||
7418 | break; | ||
6894 | } | 7419 | } |
6895 | mutex_unlock(&mutex); | 7420 | |
6896 | return ret; | 7421 | return ret; |
6897 | } | 7422 | } |
6898 | 7423 | ||
7424 | static void sched_dl_do_global(void) | ||
7425 | { | ||
7426 | u64 new_bw = -1; | ||
7427 | int cpu; | ||
7428 | |||
7429 | def_dl_bandwidth.dl_period = global_rt_period(); | ||
7430 | def_dl_bandwidth.dl_runtime = global_rt_runtime(); | ||
7431 | |||
7432 | if (global_rt_runtime() != RUNTIME_INF) | ||
7433 | new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
7434 | |||
7435 | /* | ||
7436 | * FIXME: As above... | ||
7437 | */ | ||
7438 | for_each_possible_cpu(cpu) { | ||
7439 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
7440 | |||
7441 | raw_spin_lock(&dl_b->lock); | ||
7442 | dl_b->bw = new_bw; | ||
7443 | raw_spin_unlock(&dl_b->lock); | ||
7444 | } | ||
7445 | } | ||
7446 | |||
7447 | static int sched_rt_global_validate(void) | ||
7448 | { | ||
7449 | if (sysctl_sched_rt_period <= 0) | ||
7450 | return -EINVAL; | ||
7451 | |||
7452 | if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) | ||
7453 | return -EINVAL; | ||
7454 | |||
7455 | return 0; | ||
7456 | } | ||
7457 | |||
7458 | static void sched_rt_do_global(void) | ||
7459 | { | ||
7460 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
7461 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); | ||
7462 | } | ||
7463 | |||
6899 | int sched_rt_handler(struct ctl_table *table, int write, | 7464 | int sched_rt_handler(struct ctl_table *table, int write, |
6900 | void __user *buffer, size_t *lenp, | 7465 | void __user *buffer, size_t *lenp, |
6901 | loff_t *ppos) | 7466 | loff_t *ppos) |
6902 | { | 7467 | { |
6903 | int ret; | ||
6904 | int old_period, old_runtime; | 7468 | int old_period, old_runtime; |
6905 | static DEFINE_MUTEX(mutex); | 7469 | static DEFINE_MUTEX(mutex); |
7470 | int ret; | ||
6906 | 7471 | ||
6907 | mutex_lock(&mutex); | 7472 | mutex_lock(&mutex); |
6908 | old_period = sysctl_sched_rt_period; | 7473 | old_period = sysctl_sched_rt_period; |
@@ -6911,21 +7476,50 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
6911 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7476 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
6912 | 7477 | ||
6913 | if (!ret && write) { | 7478 | if (!ret && write) { |
7479 | ret = sched_rt_global_validate(); | ||
7480 | if (ret) | ||
7481 | goto undo; | ||
7482 | |||
6914 | ret = sched_rt_global_constraints(); | 7483 | ret = sched_rt_global_constraints(); |
6915 | if (ret) { | 7484 | if (ret) |
6916 | sysctl_sched_rt_period = old_period; | 7485 | goto undo; |
6917 | sysctl_sched_rt_runtime = old_runtime; | 7486 | |
6918 | } else { | 7487 | ret = sched_dl_global_constraints(); |
6919 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | 7488 | if (ret) |
6920 | def_rt_bandwidth.rt_period = | 7489 | goto undo; |
6921 | ns_to_ktime(global_rt_period()); | 7490 | |
6922 | } | 7491 | sched_rt_do_global(); |
7492 | sched_dl_do_global(); | ||
7493 | } | ||
7494 | if (0) { | ||
7495 | undo: | ||
7496 | sysctl_sched_rt_period = old_period; | ||
7497 | sysctl_sched_rt_runtime = old_runtime; | ||
6923 | } | 7498 | } |
6924 | mutex_unlock(&mutex); | 7499 | mutex_unlock(&mutex); |
6925 | 7500 | ||
6926 | return ret; | 7501 | return ret; |
6927 | } | 7502 | } |
6928 | 7503 | ||
7504 | int sched_rr_handler(struct ctl_table *table, int write, | ||
7505 | void __user *buffer, size_t *lenp, | ||
7506 | loff_t *ppos) | ||
7507 | { | ||
7508 | int ret; | ||
7509 | static DEFINE_MUTEX(mutex); | ||
7510 | |||
7511 | mutex_lock(&mutex); | ||
7512 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
7513 | /* make sure that internally we keep jiffies */ | ||
7514 | /* also, writing zero resets timeslice to default */ | ||
7515 | if (!ret && write) { | ||
7516 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | ||
7517 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | ||
7518 | } | ||
7519 | mutex_unlock(&mutex); | ||
7520 | return ret; | ||
7521 | } | ||
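As a usage note for the handler above: it backs a sysctl (kernel.sched_rr_timeslice_ms on mainline kernels, an assumption worth double-checking in kernel/sysctl.c), values are entered in milliseconds and stored internally as jiffies, and writing zero restores the default RR_TIMESLICE. A hypothetical user-space sketch:

#include <stdio.h>

int main(void)
{
	/* Path is an assumption; it is not spelled out in this diff. */
	FILE *f = fopen("/proc/sys/kernel/sched_rr_timeslice_ms", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "25\n");	/* ask for a 25ms SCHED_RR timeslice */
	fclose(f);
	return 0;
}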
7522 | |||
6929 | #ifdef CONFIG_CGROUP_SCHED | 7523 | #ifdef CONFIG_CGROUP_SCHED |
6930 | 7524 | ||
6931 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) | 7525 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..045fc74e3f09 --- /dev/null +++ b/kernel/sched/cpudeadline.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * kernel/sched/cpudl.c | ||
3 | * | ||
4 | * Global CPU deadline management | ||
5 | * | ||
6 | * Author: Juri Lelli <j.lelli@sssup.it> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; version 2 | ||
11 | * of the License. | ||
12 | */ | ||
13 | |||
14 | #include <linux/gfp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include "cpudeadline.h" | ||
17 | |||
18 | static inline int parent(int i) | ||
19 | { | ||
20 | return (i - 1) >> 1; | ||
21 | } | ||
22 | |||
23 | static inline int left_child(int i) | ||
24 | { | ||
25 | return (i << 1) + 1; | ||
26 | } | ||
27 | |||
28 | static inline int right_child(int i) | ||
29 | { | ||
30 | return (i << 1) + 2; | ||
31 | } | ||
32 | |||
33 | static inline int dl_time_before(u64 a, u64 b) | ||
34 | { | ||
35 | return (s64)(a - b) < 0; | ||
36 | } | ||
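The three index helpers above implement a 0-based, array-backed binary heap, and dl_time_before() gives a wrap-safe ordering of u64 deadlines by looking at the sign of their difference. A standalone user-space sketch of both ideas, using the same arithmetic:

#include <assert.h>
#include <stdint.h>

/* 0-based array-backed binary heap indexing, same arithmetic as above. */
static int parent(int i)      { return (i - 1) >> 1; }
static int left_child(int i)  { return (i << 1) + 1; }
static int right_child(int i) { return (i << 1) + 2; }

/* Wrap-safe "a is earlier than b": the sign of the difference stays
 * correct across a u64 clock wrap as long as a and b are close. */
static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	assert(parent(left_child(5)) == 5 && parent(right_child(5)) == 5);
	assert(dl_time_before(10, 20));
	/* Near the wrap point a plain '<' would give the wrong answer. */
	assert(dl_time_before(UINT64_MAX - 5, 10));
	return 0;
}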
37 | |||
38 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | ||
39 | { | ||
40 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | ||
41 | |||
42 | swap(cp->elements[a], cp->elements[b]); | ||
43 | swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); | ||
44 | } | ||
45 | |||
46 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
47 | { | ||
48 | int l, r, largest; | ||
49 | |||
50 | /* adapted from lib/prio_heap.c */ | ||
51 | while (1) { | ||
52 | l = left_child(idx); | ||
53 | r = right_child(idx); | ||
54 | largest = idx; | ||
55 | |||
56 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | ||
57 | cp->elements[l].dl)) | ||
58 | largest = l; | ||
59 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | ||
60 | cp->elements[r].dl)) | ||
61 | largest = r; | ||
62 | if (largest == idx) | ||
63 | break; | ||
64 | |||
65 | /* Push idx down the heap one level and bump one up */ | ||
66 | cpudl_exchange(cp, largest, idx); | ||
67 | idx = largest; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | ||
72 | { | ||
73 | WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); | ||
74 | |||
75 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | ||
76 | cp->elements[idx].dl = new_dl; | ||
77 | cpudl_heapify(cp, idx); | ||
78 | } else { | ||
79 | cp->elements[idx].dl = new_dl; | ||
80 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
81 | cp->elements[idx].dl)) { | ||
82 | cpudl_exchange(cp, idx, parent(idx)); | ||
83 | idx = parent(idx); | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
88 | static inline int cpudl_maximum(struct cpudl *cp) | ||
89 | { | ||
90 | return cp->elements[0].cpu; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * cpudl_find - find the best (later-dl) CPU in the system | ||
95 | * @cp: the cpudl max-heap context | ||
96 | * @p: the task | ||
97 | * @later_mask: a mask to fill in with the selected CPUs (or NULL) | ||
98 | * | ||
99 | * Returns: int - best CPU (heap maximum if suitable) | ||
100 | */ | ||
101 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
102 | struct cpumask *later_mask) | ||
103 | { | ||
104 | int best_cpu = -1; | ||
105 | const struct sched_dl_entity *dl_se = &p->dl; | ||
106 | |||
107 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | ||
108 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
109 | later_mask, cpu_active_mask)) { | ||
110 | best_cpu = cpumask_any(later_mask); | ||
111 | goto out; | ||
112 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | ||
113 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | ||
114 | best_cpu = cpudl_maximum(cp); | ||
115 | if (later_mask) | ||
116 | cpumask_set_cpu(best_cpu, later_mask); | ||
117 | } | ||
118 | |||
119 | out: | ||
120 | WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); | ||
121 | |||
122 | return best_cpu; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * cpudl_set - update the cpudl max-heap | ||
127 | * @cp: the cpudl max-heap context | ||
128 | * @cpu: the target cpu | ||
129 | * @dl: the new earliest deadline for this cpu | ||
130 | * | ||
131 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
132 | * | ||
133 | * Returns: (void) | ||
134 | */ | ||
135 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | ||
136 | { | ||
137 | int old_idx, new_cpu; | ||
138 | unsigned long flags; | ||
139 | |||
140 | WARN_ON(cpu > num_present_cpus()); | ||
141 | |||
142 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
143 | old_idx = cp->cpu_to_idx[cpu]; | ||
144 | if (!is_valid) { | ||
145 | /* remove item */ | ||
146 | if (old_idx == IDX_INVALID) { | ||
147 | /* | ||
148 | * Nothing to remove if old_idx was invalid. | ||
149 | * This could happen if a rq_offline_dl is | ||
150 | * called for a CPU without -dl tasks running. | ||
151 | */ | ||
152 | goto out; | ||
153 | } | ||
154 | new_cpu = cp->elements[cp->size - 1].cpu; | ||
155 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | ||
156 | cp->elements[old_idx].cpu = new_cpu; | ||
157 | cp->size--; | ||
158 | cp->cpu_to_idx[new_cpu] = old_idx; | ||
159 | cp->cpu_to_idx[cpu] = IDX_INVALID; | ||
160 | while (old_idx > 0 && dl_time_before( | ||
161 | cp->elements[parent(old_idx)].dl, | ||
162 | cp->elements[old_idx].dl)) { | ||
163 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
164 | old_idx = parent(old_idx); | ||
165 | } | ||
166 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
167 | cpudl_heapify(cp, old_idx); | ||
168 | |||
169 | goto out; | ||
170 | } | ||
171 | |||
172 | if (old_idx == IDX_INVALID) { | ||
173 | cp->size++; | ||
174 | cp->elements[cp->size - 1].dl = 0; | ||
175 | cp->elements[cp->size - 1].cpu = cpu; | ||
176 | cp->cpu_to_idx[cpu] = cp->size - 1; | ||
177 | cpudl_change_key(cp, cp->size - 1, dl); | ||
178 | cpumask_clear_cpu(cpu, cp->free_cpus); | ||
179 | } else { | ||
180 | cpudl_change_key(cp, old_idx, dl); | ||
181 | } | ||
182 | |||
183 | out: | ||
184 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * cpudl_init - initialize the cpudl structure | ||
189 | * @cp: the cpudl max-heap context | ||
190 | */ | ||
191 | int cpudl_init(struct cpudl *cp) | ||
192 | { | ||
193 | int i; | ||
194 | |||
195 | memset(cp, 0, sizeof(*cp)); | ||
196 | raw_spin_lock_init(&cp->lock); | ||
197 | cp->size = 0; | ||
198 | for (i = 0; i < NR_CPUS; i++) | ||
199 | cp->cpu_to_idx[i] = IDX_INVALID; | ||
200 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) | ||
201 | return -ENOMEM; | ||
202 | cpumask_setall(cp->free_cpus); | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * cpudl_cleanup - clean up the cpudl structure | ||
209 | * @cp: the cpudl max-heap context | ||
210 | */ | ||
211 | void cpudl_cleanup(struct cpudl *cp) | ||
212 | { | ||
213 | /* | ||
214 | * nothing to do for the moment | ||
215 | */ | ||
216 | } | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef _LINUX_CPUDL_H | ||
2 | #define _LINUX_CPUDL_H | ||
3 | |||
4 | #include <linux/sched.h> | ||
5 | |||
6 | #define IDX_INVALID -1 | ||
7 | |||
8 | struct array_item { | ||
9 | u64 dl; | ||
10 | int cpu; | ||
11 | }; | ||
12 | |||
13 | struct cpudl { | ||
14 | raw_spinlock_t lock; | ||
15 | int size; | ||
16 | int cpu_to_idx[NR_CPUS]; | ||
17 | struct array_item elements[NR_CPUS]; | ||
18 | cpumask_var_t free_cpus; | ||
19 | }; | ||
20 | |||
21 | |||
22 | #ifdef CONFIG_SMP | ||
23 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
24 | struct cpumask *later_mask); | ||
25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | ||
26 | int cpudl_init(struct cpudl *cp); | ||
27 | void cpudl_cleanup(struct cpudl *cp); | ||
28 | #else | ||
29 | #define cpudl_set(cp, cpu, dl, is_valid) do { } while (0) | ||
30 | #define cpudl_init(cp) (0) | ||
31 | #endif /* CONFIG_SMP */ | ||
32 | |||
33 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..0de248202879 --- /dev/null +++ b/kernel/sched/deadline.c | |||
@@ -0,0 +1,1640 @@ | |||
1 | /* | ||
2 | * Deadline Scheduling Class (SCHED_DEADLINE) | ||
3 | * | ||
4 | * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). | ||
5 | * | ||
6 | * Tasks that periodically execute their instances for less than their | ||
7 | * runtime won't miss any of their deadlines. | ||
8 | * Tasks that are not periodic or sporadic, or that try to execute more | ||
9 | * than their reserved bandwidth, will be slowed down (and may potentially | ||
10 | * miss some of their deadlines) without affecting any other task. | ||
11 | * | ||
12 | * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, | ||
13 | * Juri Lelli <juri.lelli@gmail.com>, | ||
14 | * Michael Trimarchi <michael@amarulasolutions.com>, | ||
15 | * Fabio Checconi <fchecconi@gmail.com> | ||
16 | */ | ||
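For context, a task enters this class through the new sched_setattr() interface, passing runtime, deadline and period in nanoseconds. The following user-space sketch is hypothetical: the syscall number and the struct sched_attr layout shown here are assumptions that should be checked against the headers of a SCHED_DEADLINE-capable kernel.

/* Hypothetical sketch of admitting a task to SCHED_DEADLINE via the new
 * sched_setattr() syscall. The syscall number and the struct layout are
 * assumptions; check your arch's unistd.h and sched headers. */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	/* SCHED_DEADLINE parameters, all in nanoseconds */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10ms budget */
		.sched_deadline	= 30 * 1000 * 1000,	/* within 30ms */
		.sched_period	= 30 * 1000 * 1000,	/* every 30ms */
	};

	/* __NR_sched_setattr is arch specific (e.g. 314 on x86-64). */
	if (syscall(__NR_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}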
17 | #include "sched.h" | ||
18 | |||
19 | #include <linux/slab.h> | ||
20 | |||
21 | struct dl_bandwidth def_dl_bandwidth; | ||
22 | |||
23 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | ||
24 | { | ||
25 | return container_of(dl_se, struct task_struct, dl); | ||
26 | } | ||
27 | |||
28 | static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) | ||
29 | { | ||
30 | return container_of(dl_rq, struct rq, dl); | ||
31 | } | ||
32 | |||
33 | static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) | ||
34 | { | ||
35 | struct task_struct *p = dl_task_of(dl_se); | ||
36 | struct rq *rq = task_rq(p); | ||
37 | |||
38 | return &rq->dl; | ||
39 | } | ||
40 | |||
41 | static inline int on_dl_rq(struct sched_dl_entity *dl_se) | ||
42 | { | ||
43 | return !RB_EMPTY_NODE(&dl_se->rb_node); | ||
44 | } | ||
45 | |||
46 | static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) | ||
47 | { | ||
48 | struct sched_dl_entity *dl_se = &p->dl; | ||
49 | |||
50 | return dl_rq->rb_leftmost == &dl_se->rb_node; | ||
51 | } | ||
52 | |||
53 | void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) | ||
54 | { | ||
55 | raw_spin_lock_init(&dl_b->dl_runtime_lock); | ||
56 | dl_b->dl_period = period; | ||
57 | dl_b->dl_runtime = runtime; | ||
58 | } | ||
59 | |||
60 | extern unsigned long to_ratio(u64 period, u64 runtime); | ||
61 | |||
62 | void init_dl_bw(struct dl_bw *dl_b) | ||
63 | { | ||
64 | raw_spin_lock_init(&dl_b->lock); | ||
65 | raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); | ||
66 | if (global_rt_runtime() == RUNTIME_INF) | ||
67 | dl_b->bw = -1; | ||
68 | else | ||
69 | dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
70 | raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); | ||
71 | dl_b->total_bw = 0; | ||
72 | } | ||
73 | |||
74 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | ||
75 | { | ||
76 | dl_rq->rb_root = RB_ROOT; | ||
77 | |||
78 | #ifdef CONFIG_SMP | ||
79 | /* zero means no -deadline tasks */ | ||
80 | dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; | ||
81 | |||
82 | dl_rq->dl_nr_migratory = 0; | ||
83 | dl_rq->overloaded = 0; | ||
84 | dl_rq->pushable_dl_tasks_root = RB_ROOT; | ||
85 | #else | ||
86 | init_dl_bw(&dl_rq->dl_bw); | ||
87 | #endif | ||
88 | } | ||
89 | |||
90 | #ifdef CONFIG_SMP | ||
91 | |||
92 | static inline int dl_overloaded(struct rq *rq) | ||
93 | { | ||
94 | return atomic_read(&rq->rd->dlo_count); | ||
95 | } | ||
96 | |||
97 | static inline void dl_set_overload(struct rq *rq) | ||
98 | { | ||
99 | if (!rq->online) | ||
100 | return; | ||
101 | |||
102 | cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); | ||
103 | /* | ||
104 | * Must be visible before the overload count is | ||
105 | * set (as in sched_rt.c). | ||
106 | * | ||
107 | * Matched by the barrier in pull_dl_task(). | ||
108 | */ | ||
109 | smp_wmb(); | ||
110 | atomic_inc(&rq->rd->dlo_count); | ||
111 | } | ||
112 | |||
113 | static inline void dl_clear_overload(struct rq *rq) | ||
114 | { | ||
115 | if (!rq->online) | ||
116 | return; | ||
117 | |||
118 | atomic_dec(&rq->rd->dlo_count); | ||
119 | cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); | ||
120 | } | ||
121 | |||
122 | static void update_dl_migration(struct dl_rq *dl_rq) | ||
123 | { | ||
124 | if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { | ||
125 | if (!dl_rq->overloaded) { | ||
126 | dl_set_overload(rq_of_dl_rq(dl_rq)); | ||
127 | dl_rq->overloaded = 1; | ||
128 | } | ||
129 | } else if (dl_rq->overloaded) { | ||
130 | dl_clear_overload(rq_of_dl_rq(dl_rq)); | ||
131 | dl_rq->overloaded = 0; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
136 | { | ||
137 | struct task_struct *p = dl_task_of(dl_se); | ||
138 | dl_rq = &rq_of_dl_rq(dl_rq)->dl; | ||
139 | |||
140 | dl_rq->dl_nr_total++; | ||
141 | if (p->nr_cpus_allowed > 1) | ||
142 | dl_rq->dl_nr_migratory++; | ||
143 | |||
144 | update_dl_migration(dl_rq); | ||
145 | } | ||
146 | |||
147 | static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
148 | { | ||
149 | struct task_struct *p = dl_task_of(dl_se); | ||
150 | dl_rq = &rq_of_dl_rq(dl_rq)->dl; | ||
151 | |||
152 | dl_rq->dl_nr_total--; | ||
153 | if (p->nr_cpus_allowed > 1) | ||
154 | dl_rq->dl_nr_migratory--; | ||
155 | |||
156 | update_dl_migration(dl_rq); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * The list of pushable -deadline task is not a plist, like in | ||
161 | * sched_rt.c, it is an rb-tree with tasks ordered by deadline. | ||
162 | */ | ||
163 | static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
164 | { | ||
165 | struct dl_rq *dl_rq = &rq->dl; | ||
166 | struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; | ||
167 | struct rb_node *parent = NULL; | ||
168 | struct task_struct *entry; | ||
169 | int leftmost = 1; | ||
170 | |||
171 | BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); | ||
172 | |||
173 | while (*link) { | ||
174 | parent = *link; | ||
175 | entry = rb_entry(parent, struct task_struct, | ||
176 | pushable_dl_tasks); | ||
177 | if (dl_entity_preempt(&p->dl, &entry->dl)) | ||
178 | link = &parent->rb_left; | ||
179 | else { | ||
180 | link = &parent->rb_right; | ||
181 | leftmost = 0; | ||
182 | } | ||
183 | } | ||
184 | |||
185 | if (leftmost) | ||
186 | dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; | ||
187 | |||
188 | rb_link_node(&p->pushable_dl_tasks, parent, link); | ||
189 | rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
190 | } | ||
191 | |||
192 | static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
193 | { | ||
194 | struct dl_rq *dl_rq = &rq->dl; | ||
195 | |||
196 | if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) | ||
197 | return; | ||
198 | |||
199 | if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { | ||
200 | struct rb_node *next_node; | ||
201 | |||
202 | next_node = rb_next(&p->pushable_dl_tasks); | ||
203 | dl_rq->pushable_dl_tasks_leftmost = next_node; | ||
204 | } | ||
205 | |||
206 | rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
207 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
208 | } | ||
209 | |||
210 | static inline int has_pushable_dl_tasks(struct rq *rq) | ||
211 | { | ||
212 | return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); | ||
213 | } | ||
214 | |||
215 | static int push_dl_task(struct rq *rq); | ||
216 | |||
217 | #else | ||
218 | |||
219 | static inline | ||
220 | void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
221 | { | ||
222 | } | ||
223 | |||
224 | static inline | ||
225 | void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
226 | { | ||
227 | } | ||
228 | |||
229 | static inline | ||
230 | void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
231 | { | ||
232 | } | ||
233 | |||
234 | static inline | ||
235 | void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
236 | { | ||
237 | } | ||
238 | |||
239 | #endif /* CONFIG_SMP */ | ||
240 | |||
241 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
242 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
243 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
244 | int flags); | ||
245 | |||
246 | /* | ||
247 | * We are being explicitly informed that a new instance is starting, | ||
248 | * and this means that: | ||
249 | * - the absolute deadline of the entity has to be placed at | ||
250 | * current time + relative deadline; | ||
251 | * - the runtime of the entity has to be set to the maximum value. | ||
252 | * | ||
253 | * The capability of specifying such an event is useful whenever a -deadline | ||
254 | * entity wants to (try to!) synchronize its behaviour with that of the | ||
255 | * scheduler, and to (try to!) reconcile itself with its own scheduling | ||
256 | * parameters. | ||
257 | */ | ||
258 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | ||
259 | struct sched_dl_entity *pi_se) | ||
260 | { | ||
261 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
262 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
263 | |||
264 | WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); | ||
265 | |||
266 | /* | ||
267 | * We use the regular wall clock time to set deadlines in the | ||
268 | * future; in fact, we must consider execution overheads (time | ||
269 | * spent on hardirq context, etc.). | ||
270 | */ | ||
271 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
272 | dl_se->runtime = pi_se->dl_runtime; | ||
273 | dl_se->dl_new = 0; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Pure Earliest Deadline First (EDF) scheduling does not deal with the | ||
278 | * possibility of an entity lasting more than what it declared, and thus | ||
279 | * exhausting its runtime. | ||
280 | * | ||
281 | * Here we are interested in making runtime overrun possible, but we do | ||
282 | * not want a misbehaving entity to affect the scheduling of all the | ||
283 | * other entities. | ||
284 | * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) | ||
285 | * is used, in order to confine each entity within its own bandwidth. | ||
286 | * | ||
287 | * This function deals exactly with that, and ensures that when the runtime | ||
288 | * of an entity is replenished, its deadline is also postponed. That ensures | ||
289 | * the overrunning entity can't interfere with other entities in the system | ||
290 | * and can't make them miss their deadlines. Typical reasons for this kind | ||
291 | * of overrun are an entity voluntarily trying to exceed its runtime, or | ||
292 | * simply having underestimated it when calling sched_setscheduler_ex(). | ||
293 | */ | ||
294 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, | ||
295 | struct sched_dl_entity *pi_se) | ||
296 | { | ||
297 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
298 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
299 | |||
300 | BUG_ON(pi_se->dl_runtime <= 0); | ||
301 | |||
302 | /* | ||
303 | * This could be the case for a !-dl task that is boosted. | ||
304 | * Just go with full inherited parameters. | ||
305 | */ | ||
306 | if (dl_se->dl_deadline == 0) { | ||
307 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
308 | dl_se->runtime = pi_se->dl_runtime; | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * We keep moving the deadline away until we get some | ||
313 | * available runtime for the entity. This ensures correct | ||
314 | * handling of situations where the runtime overrun is | ||
315 | * arbitrarily large. | ||
316 | */ | ||
317 | while (dl_se->runtime <= 0) { | ||
318 | dl_se->deadline += pi_se->dl_period; | ||
319 | dl_se->runtime += pi_se->dl_runtime; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * At this point, the deadline really should be "in | ||
324 | * the future" with respect to rq->clock. If it's | ||
325 | * not, we are, for some reason, lagging too much! | ||
326 | * Anyway, after having warn userspace abut that, | ||
327 | * we still try to keep the things running by | ||
328 | * resetting the deadline and the budget of the | ||
329 | * entity. | ||
330 | */ | ||
331 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | ||
332 | static bool lag_once = false; | ||
333 | |||
334 | if (!lag_once) { | ||
335 | lag_once = true; | ||
336 | printk_sched("sched: DL replenish lagged too much\n"); | ||
337 | } | ||
338 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
339 | dl_se->runtime = pi_se->dl_runtime; | ||
340 | } | ||
341 | } | ||
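A toy walk-through of the replenishment loop above, with made-up numbers: a 10ms/100ms reservation that finished an instance 25ms in the red needs three period postponements before it owns positive budget again.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int64_t dl_runtime = 10 * 1000 * 1000;	/* 10ms per period */
	const int64_t dl_period  = 100 * 1000 * 1000;	/* 100ms period */
	int64_t runtime  = -25 * 1000 * 1000;		/* 25ms in the red */
	int64_t deadline = 500 * 1000 * 1000;		/* current absolute deadline */

	/* Same loop as replenish_dl_entity(): push the deadline one period
	 * away at a time until the entity owns runtime again. Here it takes
	 * three periods, landing at t=800ms with 5ms of budget. */
	while (runtime <= 0) {
		deadline += dl_period;
		runtime  += dl_runtime;
	}
	printf("deadline=%lld ns, runtime=%lld ns\n",
	       (long long)deadline, (long long)runtime);
	return 0;
}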
342 | |||
343 | /* | ||
344 | * Here we check if --at time t-- an entity (which is probably being | ||
345 | * [re]activated or, in general, enqueued) can use its remaining runtime | ||
346 | * and its current deadline _without_ exceeding the bandwidth it is | ||
347 | * assigned (function returns true if it can't). We are in fact applying | ||
348 | * one of the CBS rules: when a task wakes up, if the residual runtime | ||
349 | * over residual deadline fits within the allocated bandwidth, then we | ||
350 | * can keep the current (absolute) deadline and residual budget without | ||
351 | * disrupting the schedulability of the system. Otherwise, we should | ||
352 | * refill the runtime and set the deadline a period in the future, | ||
353 | * because keeping the current (absolute) deadline of the task would | ||
354 | * result in breaking guarantees promised to other tasks. | ||
355 | * | ||
356 | * This function returns true if: | ||
357 | * | ||
358 | * runtime / (deadline - t) > dl_runtime / dl_period , | ||
359 | * | ||
360 | * IOW we can't recycle current parameters. | ||
361 | * | ||
362 | * Notice that the bandwidth check is done against the period. For | ||
363 | * tasks with deadline equal to period this is the same as using | ||
364 | * dl_deadline instead of dl_period in the equation above. | ||
365 | */ | ||
366 | static bool dl_entity_overflow(struct sched_dl_entity *dl_se, | ||
367 | struct sched_dl_entity *pi_se, u64 t) | ||
368 | { | ||
369 | u64 left, right; | ||
370 | |||
371 | /* | ||
372 | * left and right are the two sides of the equation above, | ||
373 | * after a bit of shuffling to use multiplications instead | ||
374 | * of divisions. | ||
375 | * | ||
376 | * Note that none of the time values involved in the two | ||
377 | * multiplications are absolute: dl_deadline and dl_runtime | ||
378 | * are the relative deadline and the maximum runtime of each | ||
379 | * instance, runtime is the runtime left for the last instance | ||
380 | * and (deadline - t), since t is rq->clock, is the time left | ||
381 | * to the (absolute) deadline. Even if overflowing the u64 type | ||
382 | * is very unlikely to occur in both cases, here we scale down | ||
383 | * as we want to avoid that risk at all. Scaling down by 10 bits | ||
384 | * means that we reduce granularity to ~1us. We are fine with it, | ||
385 | * since this is only a true/false check and, anyway, thinking | ||
386 | * of anything below microseconds resolution is actually fiction | ||
387 | * (but still we want to give the user that illusion >;). | ||
388 | */ | ||
389 | left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); | ||
390 | right = ((dl_se->deadline - t) >> DL_SCALE) * | ||
391 | (pi_se->dl_runtime >> DL_SCALE); | ||
392 | |||
393 | return dl_time_before(right, left); | ||
394 | } | ||
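A worked instance of the check above, with made-up numbers: a task reserved at 10ms every 100ms (10% bandwidth) wakes up with 4ms of leftover runtime and 30ms to its old deadline. Reusing that deadline would let it run at roughly 13% bandwidth, so the test reports an overflow and a fresh deadline must be set.

#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* drop ~10 bits, i.e. compare in ~1us units */

int main(void)
{
	uint64_t dl_runtime = 10 * 1000 * 1000;		/* reserved: 10ms ... */
	uint64_t dl_period  = 100 * 1000 * 1000;	/* ... every 100ms (10%) */
	uint64_t runtime    = 4 * 1000 * 1000;		/* leftover budget: 4ms */
	uint64_t dl_minus_t = 30 * 1000 * 1000;		/* time to old deadline: 30ms */

	/* Cross-multiplied form of runtime/(deadline - t) > dl_runtime/dl_period,
	 * as in dl_entity_overflow(); a plain '<' stands in for dl_time_before(). */
	uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
	uint64_t right = (dl_minus_t >> DL_SCALE) * (dl_runtime >> DL_SCALE);

	printf("%s\n", right < left ? "overflow: set a fresh deadline"
				    : "ok: keep the current deadline");
	return 0;
}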
395 | |||
396 | /* | ||
397 | * When a -deadline entity is queued back on the runqueue, its runtime and | ||
398 | * deadline might need updating. | ||
399 | * | ||
400 | * The policy here is that we update the deadline of the entity only if: | ||
401 | * - the current deadline is in the past, | ||
402 | * - using the remaining runtime with the current deadline would make | ||
403 | * the entity exceed its bandwidth. | ||
404 | */ | ||
405 | static void update_dl_entity(struct sched_dl_entity *dl_se, | ||
406 | struct sched_dl_entity *pi_se) | ||
407 | { | ||
408 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
409 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
410 | |||
411 | /* | ||
412 | * The arrival of a new instance needs special treatment, i.e., | ||
413 | * the actual scheduling parameters have to be "renewed". | ||
414 | */ | ||
415 | if (dl_se->dl_new) { | ||
416 | setup_new_dl_entity(dl_se, pi_se); | ||
417 | return; | ||
418 | } | ||
419 | |||
420 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | ||
421 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | ||
422 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
423 | dl_se->runtime = pi_se->dl_runtime; | ||
424 | } | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * If the entity depleted all its runtime, and if we want it to sleep | ||
429 | * while waiting for some new execution time to become available, we | ||
430 | * set the bandwidth enforcement timer to the replenishment instant | ||
431 | * and try to activate it. | ||
432 | * | ||
433 | * Notice that it is important for the caller to know if the timer | ||
434 | * actually started or not (i.e., the replenishment instant is in | ||
435 | * the future or in the past). | ||
436 | */ | ||
437 | static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) | ||
438 | { | ||
439 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
440 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
441 | ktime_t now, act; | ||
442 | ktime_t soft, hard; | ||
443 | unsigned long range; | ||
444 | s64 delta; | ||
445 | |||
446 | if (boosted) | ||
447 | return 0; | ||
448 | /* | ||
449 | * We want the timer to fire at the deadline, taking into | ||
450 | * account that the deadline is expressed in rq->clock time | ||
451 | * and not in the hrtimer's time base. | ||
452 | */ | ||
453 | act = ns_to_ktime(dl_se->deadline); | ||
454 | now = hrtimer_cb_get_time(&dl_se->dl_timer); | ||
455 | delta = ktime_to_ns(now) - rq_clock(rq); | ||
456 | act = ktime_add_ns(act, delta); | ||
457 | |||
458 | /* | ||
459 | * If the expiry time already passed, e.g., because the value | ||
460 | * chosen as the deadline is too small, don't even try to | ||
461 | * start the timer in the past! | ||
462 | */ | ||
463 | if (ktime_us_delta(act, now) < 0) | ||
464 | return 0; | ||
465 | |||
466 | hrtimer_set_expires(&dl_se->dl_timer, act); | ||
467 | |||
468 | soft = hrtimer_get_softexpires(&dl_se->dl_timer); | ||
469 | hard = hrtimer_get_expires(&dl_se->dl_timer); | ||
470 | range = ktime_to_ns(ktime_sub(hard, soft)); | ||
471 | __hrtimer_start_range_ns(&dl_se->dl_timer, soft, | ||
472 | range, HRTIMER_MODE_ABS, 0); | ||
473 | |||
474 | return hrtimer_active(&dl_se->dl_timer); | ||
475 | } | ||
476 | |||
477 | /* | ||
478 | * This is the bandwidth enforcement timer callback. If here, we know | ||
479 | * a task is not on its dl_rq, since the fact that the timer was running | ||
480 | * means the task is throttled and needs a runtime replenishment. | ||
481 | * | ||
482 | * However, what we actually do depends on whether the task is active | ||
483 | * (i.e., it is on its rq) or has been removed from there by a call to | ||
484 | * dequeue_task_dl(). In the former case we must issue the runtime | ||
485 | * replenishment and add the task back to the dl_rq; in the latter, we just | ||
486 | * do nothing but clear dl_throttled, so that runtime and deadline | ||
487 | * updating (and the queueing back to dl_rq) will be done by the | ||
488 | * next call to enqueue_task_dl(). | ||
489 | */ | ||
490 | static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | ||
491 | { | ||
492 | struct sched_dl_entity *dl_se = container_of(timer, | ||
493 | struct sched_dl_entity, | ||
494 | dl_timer); | ||
495 | struct task_struct *p = dl_task_of(dl_se); | ||
496 | struct rq *rq = task_rq(p); | ||
497 | raw_spin_lock(&rq->lock); | ||
498 | |||
499 | /* | ||
500 | * We need to take care of possible races here. In fact, the | ||
501 | * task might have changed its scheduling policy to something | ||
502 | * different from SCHED_DEADLINE or changed its reservation | ||
503 | * parameters (through sched_setscheduler()). | ||
504 | */ | ||
505 | if (!dl_task(p) || dl_se->dl_new) | ||
506 | goto unlock; | ||
507 | |||
508 | sched_clock_tick(); | ||
509 | update_rq_clock(rq); | ||
510 | dl_se->dl_throttled = 0; | ||
511 | if (p->on_rq) { | ||
512 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
513 | if (task_has_dl_policy(rq->curr)) | ||
514 | check_preempt_curr_dl(rq, p, 0); | ||
515 | else | ||
516 | resched_task(rq->curr); | ||
517 | #ifdef CONFIG_SMP | ||
518 | /* | ||
519 | * Queueing this task back might have overloaded rq, | ||
520 | * check if we need to kick someone away. | ||
521 | */ | ||
522 | if (has_pushable_dl_tasks(rq)) | ||
523 | push_dl_task(rq); | ||
524 | #endif | ||
525 | } | ||
526 | unlock: | ||
527 | raw_spin_unlock(&rq->lock); | ||
528 | |||
529 | return HRTIMER_NORESTART; | ||
530 | } | ||
531 | |||
532 | void init_dl_task_timer(struct sched_dl_entity *dl_se) | ||
533 | { | ||
534 | struct hrtimer *timer = &dl_se->dl_timer; | ||
535 | |||
536 | if (hrtimer_active(timer)) { | ||
537 | hrtimer_try_to_cancel(timer); | ||
538 | return; | ||
539 | } | ||
540 | |||
541 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
542 | timer->function = dl_task_timer; | ||
543 | } | ||
544 | |||
545 | static | ||
546 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | ||
547 | { | ||
548 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | ||
549 | int rorun = dl_se->runtime <= 0; | ||
550 | |||
551 | if (!rorun && !dmiss) | ||
552 | return 0; | ||
553 | |||
554 | /* | ||
555 | * If we are beyond our current deadline and we are still | ||
556 | * executing, then we have already used some of the runtime of | ||
557 | * the next instance. Thus, if we do not account that, we are | ||
558 | * stealing bandwidth from the system at each deadline miss! | ||
559 | */ | ||
560 | if (dmiss) { | ||
561 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
562 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
563 | } | ||
564 | |||
565 | return 1; | ||
566 | } | ||
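A small illustration of the accounting above: when the task is caught past its deadline with budget still left, the leftover is discarded and the overrun is charged against the next instance.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  runtime  = 2 * 1000 * 1000;	/* 2ms of budget left, but... */
	uint64_t deadline = 100 * 1000 * 1000;
	uint64_t now      = 103 * 1000 * 1000;	/* ...caught 3ms past the deadline */
	int rorun = runtime <= 0;

	/* As in dl_runtime_exceeded(): unless the budget was already gone,
	 * throw the leftover away, then charge the 3ms overrun so the next
	 * instance is replenished starting from -3ms rather than 0. */
	runtime = rorun ? runtime : 0;
	runtime -= (int64_t)(now - deadline);
	printf("next instance replenishes from %lld ns\n", (long long)runtime);
	return 0;
}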
567 | |||
568 | /* | ||
569 | * Update the current task's runtime statistics (provided it is still | ||
570 | * a -deadline task and has not been removed from the dl_rq). | ||
571 | */ | ||
572 | static void update_curr_dl(struct rq *rq) | ||
573 | { | ||
574 | struct task_struct *curr = rq->curr; | ||
575 | struct sched_dl_entity *dl_se = &curr->dl; | ||
576 | u64 delta_exec; | ||
577 | |||
578 | if (!dl_task(curr) || !on_dl_rq(dl_se)) | ||
579 | return; | ||
580 | |||
581 | /* | ||
582 | * Consumed budget is computed considering the time as | ||
583 | * observed by schedulable tasks (excluding time spent | ||
584 | * in hardirq context, etc.). Deadlines are instead | ||
585 | * computed using hard walltime. This seems to be the more | ||
586 | * natural solution, but the full ramifications of this | ||
587 | * approach need further study. | ||
588 | */ | ||
589 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; | ||
590 | if (unlikely((s64)delta_exec < 0)) | ||
591 | delta_exec = 0; | ||
592 | |||
593 | schedstat_set(curr->se.statistics.exec_max, | ||
594 | max(curr->se.statistics.exec_max, delta_exec)); | ||
595 | |||
596 | curr->se.sum_exec_runtime += delta_exec; | ||
597 | account_group_exec_runtime(curr, delta_exec); | ||
598 | |||
599 | curr->se.exec_start = rq_clock_task(rq); | ||
600 | cpuacct_charge(curr, delta_exec); | ||
601 | |||
602 | sched_rt_avg_update(rq, delta_exec); | ||
603 | |||
604 | dl_se->runtime -= delta_exec; | ||
605 | if (dl_runtime_exceeded(rq, dl_se)) { | ||
606 | __dequeue_task_dl(rq, curr, 0); | ||
607 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | ||
608 | dl_se->dl_throttled = 1; | ||
609 | else | ||
610 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | ||
611 | |||
612 | if (!is_leftmost(curr, &rq->dl)) | ||
613 | resched_task(curr); | ||
614 | } | ||
615 | |||
616 | /* | ||
617 | * Because -- for now -- we share the rt bandwidth, we need to | ||
618 | * account our runtime there too, otherwise actual rt tasks | ||
619 | * would be able to exceed the shared quota. | ||
620 | * | ||
621 | * Account to the root rt group for now. | ||
622 | * | ||
623 | * The solution we're working towards is having the RT groups scheduled | ||
624 | * using deadline servers -- however there are a few nasties to figure | ||
625 | * out before that can happen. | ||
626 | */ | ||
627 | if (rt_bandwidth_enabled()) { | ||
628 | struct rt_rq *rt_rq = &rq->rt; | ||
629 | |||
630 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
631 | rt_rq->rt_time += delta_exec; | ||
632 | /* | ||
633 | * We'll let actual RT tasks worry about the overflow here, we | ||
634 | * have our own CBS to keep us in line -- see above. | ||
635 | */ | ||
636 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
637 | } | ||
638 | } | ||
639 | |||
640 | #ifdef CONFIG_SMP | ||
641 | |||
642 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); | ||
643 | |||
644 | static inline u64 next_deadline(struct rq *rq) | ||
645 | { | ||
646 | struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); | ||
647 | |||
648 | if (next && dl_prio(next->prio)) | ||
649 | return next->dl.deadline; | ||
650 | else | ||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
655 | { | ||
656 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
657 | |||
658 | if (dl_rq->earliest_dl.curr == 0 || | ||
659 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | ||
660 | /* | ||
661 | * If the dl_rq had no -deadline tasks, or if the new task | ||
662 | * has shorter deadline than the current one on dl_rq, we | ||
663 | * know that the previous earliest becomes our next earliest, | ||
664 | * as the new task becomes the earliest itself. | ||
665 | */ | ||
666 | dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; | ||
667 | dl_rq->earliest_dl.curr = deadline; | ||
668 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | ||
669 | } else if (dl_rq->earliest_dl.next == 0 || | ||
670 | dl_time_before(deadline, dl_rq->earliest_dl.next)) { | ||
671 | /* | ||
672 | * On the other hand, if the new -deadline task has a | ||
673 | * later deadline than the earliest one on dl_rq, but | ||
674 | * it is earlier than the next (if any), we must | ||
675 | * recompute the next-earliest. | ||
676 | */ | ||
677 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
678 | } | ||
679 | } | ||
680 | |||
681 | static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
682 | { | ||
683 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
684 | |||
685 | /* | ||
686 | * Since we may have removed our earliest (and/or next earliest) | ||
687 | * task we must recompute them. | ||
688 | */ | ||
689 | if (!dl_rq->dl_nr_running) { | ||
690 | dl_rq->earliest_dl.curr = 0; | ||
691 | dl_rq->earliest_dl.next = 0; | ||
692 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
693 | } else { | ||
694 | struct rb_node *leftmost = dl_rq->rb_leftmost; | ||
695 | struct sched_dl_entity *entry; | ||
696 | |||
697 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | ||
698 | dl_rq->earliest_dl.curr = entry->deadline; | ||
699 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
700 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | ||
701 | } | ||
702 | } | ||
703 | |||
704 | #else | ||
705 | |||
706 | static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
707 | static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
708 | |||
709 | #endif /* CONFIG_SMP */ | ||
710 | |||
711 | static inline | ||
712 | void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
713 | { | ||
714 | int prio = dl_task_of(dl_se)->prio; | ||
715 | u64 deadline = dl_se->deadline; | ||
716 | |||
717 | WARN_ON(!dl_prio(prio)); | ||
718 | dl_rq->dl_nr_running++; | ||
719 | |||
720 | inc_dl_deadline(dl_rq, deadline); | ||
721 | inc_dl_migration(dl_se, dl_rq); | ||
722 | } | ||
723 | |||
724 | static inline | ||
725 | void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
726 | { | ||
727 | int prio = dl_task_of(dl_se)->prio; | ||
728 | |||
729 | WARN_ON(!dl_prio(prio)); | ||
730 | WARN_ON(!dl_rq->dl_nr_running); | ||
731 | dl_rq->dl_nr_running--; | ||
732 | |||
733 | dec_dl_deadline(dl_rq, dl_se->deadline); | ||
734 | dec_dl_migration(dl_se, dl_rq); | ||
735 | } | ||
736 | |||
737 | static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) | ||
738 | { | ||
739 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
740 | struct rb_node **link = &dl_rq->rb_root.rb_node; | ||
741 | struct rb_node *parent = NULL; | ||
742 | struct sched_dl_entity *entry; | ||
743 | int leftmost = 1; | ||
744 | |||
745 | BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); | ||
746 | |||
747 | while (*link) { | ||
748 | parent = *link; | ||
749 | entry = rb_entry(parent, struct sched_dl_entity, rb_node); | ||
750 | if (dl_time_before(dl_se->deadline, entry->deadline)) | ||
751 | link = &parent->rb_left; | ||
752 | else { | ||
753 | link = &parent->rb_right; | ||
754 | leftmost = 0; | ||
755 | } | ||
756 | } | ||
757 | |||
758 | if (leftmost) | ||
759 | dl_rq->rb_leftmost = &dl_se->rb_node; | ||
760 | |||
761 | rb_link_node(&dl_se->rb_node, parent, link); | ||
762 | rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); | ||
763 | |||
764 | inc_dl_tasks(dl_se, dl_rq); | ||
765 | } | ||
766 | |||
767 | static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
768 | { | ||
769 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
770 | |||
771 | if (RB_EMPTY_NODE(&dl_se->rb_node)) | ||
772 | return; | ||
773 | |||
774 | if (dl_rq->rb_leftmost == &dl_se->rb_node) { | ||
775 | struct rb_node *next_node; | ||
776 | |||
777 | next_node = rb_next(&dl_se->rb_node); | ||
778 | dl_rq->rb_leftmost = next_node; | ||
779 | } | ||
780 | |||
781 | rb_erase(&dl_se->rb_node, &dl_rq->rb_root); | ||
782 | RB_CLEAR_NODE(&dl_se->rb_node); | ||
783 | |||
784 | dec_dl_tasks(dl_se, dl_rq); | ||
785 | } | ||
786 | |||
787 | static void | ||
788 | enqueue_dl_entity(struct sched_dl_entity *dl_se, | ||
789 | struct sched_dl_entity *pi_se, int flags) | ||
790 | { | ||
791 | BUG_ON(on_dl_rq(dl_se)); | ||
792 | |||
793 | /* | ||
794 | * If this is a wakeup or a new instance, the scheduling | ||
795 | * parameters of the task might need updating. Otherwise, | ||
796 | * we want a replenishment of its runtime. | ||
797 | */ | ||
798 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | ||
799 | replenish_dl_entity(dl_se, pi_se); | ||
800 | else | ||
801 | update_dl_entity(dl_se, pi_se); | ||
802 | |||
803 | __enqueue_dl_entity(dl_se); | ||
804 | } | ||
805 | |||
806 | static void dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
807 | { | ||
808 | __dequeue_dl_entity(dl_se); | ||
809 | } | ||
810 | |||
811 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
812 | { | ||
813 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | ||
814 | struct sched_dl_entity *pi_se = &p->dl; | ||
815 | |||
816 | /* | ||
817 | * Use the scheduling parameters of the top pi-waiter | ||
818 | * task if we have one and its (relative) deadline is | ||
819 | * smaller than ours; otherwise we keep our own runtime | ||
820 | * and deadline. | ||
821 | */ | ||
822 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | ||
823 | pi_se = &pi_task->dl; | ||
824 | |||
825 | /* | ||
826 | * If p is throttled, we do nothing. In fact, if it exhausted | ||
827 | * its budget it needs a replenishment and, since it now is on | ||
828 | * its rq, the bandwidth timer callback (which clearly has not | ||
829 | * run yet) will take care of this. | ||
830 | */ | ||
831 | if (p->dl.dl_throttled) | ||
832 | return; | ||
833 | |||
834 | enqueue_dl_entity(&p->dl, pi_se, flags); | ||
835 | |||
836 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | ||
837 | enqueue_pushable_dl_task(rq, p); | ||
838 | |||
839 | inc_nr_running(rq); | ||
840 | } | ||
841 | |||
842 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
843 | { | ||
844 | dequeue_dl_entity(&p->dl); | ||
845 | dequeue_pushable_dl_task(rq, p); | ||
846 | } | ||
847 | |||
848 | static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
849 | { | ||
850 | update_curr_dl(rq); | ||
851 | __dequeue_task_dl(rq, p, flags); | ||
852 | |||
853 | dec_nr_running(rq); | ||
854 | } | ||
855 | |||
856 | /* | ||
857 | * Yield task semantic for -deadline tasks is: | ||
858 | * | ||
859 | * get off the CPU until our next instance, with | ||
860 | * a new runtime. This is of little use now, since we | ||
861 | * don't have a bandwidth reclaiming mechanism. Anyway, | ||
862 | * bandwidth reclaiming is planned for the future, and | ||
863 | * yield_task_dl will indicate that some spare budget | ||
864 | * is available for other task instances to use. | ||
865 | */ | ||
866 | static void yield_task_dl(struct rq *rq) | ||
867 | { | ||
868 | struct task_struct *p = rq->curr; | ||
869 | |||
870 | /* | ||
871 | * We make the task go to sleep until its current deadline by | ||
872 | * forcing its runtime to zero. This way, update_curr_dl() stops | ||
873 | * it and the bandwidth timer will wake it up and will give it | ||
874 | * new scheduling parameters (thanks to dl_new=1). | ||
875 | */ | ||
876 | if (p->dl.runtime > 0) { | ||
877 | rq->curr->dl.dl_new = 1; | ||
878 | p->dl.runtime = 0; | ||
879 | } | ||
880 | update_curr_dl(rq); | ||
881 | } | ||
882 | |||
883 | #ifdef CONFIG_SMP | ||
884 | |||
885 | static int find_later_rq(struct task_struct *task); | ||
886 | |||
887 | static int | ||
888 | select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
889 | { | ||
890 | struct task_struct *curr; | ||
891 | struct rq *rq; | ||
892 | |||
893 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
894 | goto out; | ||
895 | |||
896 | rq = cpu_rq(cpu); | ||
897 | |||
898 | rcu_read_lock(); | ||
899 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
900 | |||
901 | /* | ||
902 | * If we are dealing with a -deadline task, we must | ||
903 | * decide where to wake it up. | ||
904 | * If it has a later deadline and the current task | ||
905 | * on this rq can't move (provided the waking task | ||
906 | * can!) we prefer to send it somewhere else. On the | ||
907 | * other hand, if it has a shorter deadline, we | ||
908 | * try to make it stay here, it might be important. | ||
909 | */ | ||
910 | if (unlikely(dl_task(curr)) && | ||
911 | (curr->nr_cpus_allowed < 2 || | ||
912 | !dl_entity_preempt(&p->dl, &curr->dl)) && | ||
913 | (p->nr_cpus_allowed > 1)) { | ||
914 | int target = find_later_rq(p); | ||
915 | |||
916 | if (target != -1) | ||
917 | cpu = target; | ||
918 | } | ||
919 | rcu_read_unlock(); | ||
920 | |||
921 | out: | ||
922 | return cpu; | ||
923 | } | ||
924 | |||
925 | static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | ||
926 | { | ||
927 | /* | ||
928 | * Current can't be migrated, useless to reschedule, | ||
929 | * let's hope p can move out. | ||
930 | */ | ||
931 | if (rq->curr->nr_cpus_allowed == 1 || | ||
932 | cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) | ||
933 | return; | ||
934 | |||
935 | /* | ||
936 | * p is migratable, so let's not schedule it and | ||
937 | * see if it is pushed or pulled somewhere else. | ||
938 | */ | ||
939 | if (p->nr_cpus_allowed != 1 && | ||
940 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) | ||
941 | return; | ||
942 | |||
943 | resched_task(rq->curr); | ||
944 | } | ||
945 | |||
946 | #endif /* CONFIG_SMP */ | ||
947 | |||
948 | /* | ||
949 | * Only called when both the current and waking task are -deadline | ||
950 | * tasks. | ||
951 | */ | ||
952 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
953 | int flags) | ||
954 | { | ||
955 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { | ||
956 | resched_task(rq->curr); | ||
957 | return; | ||
958 | } | ||
959 | |||
960 | #ifdef CONFIG_SMP | ||
961 | /* | ||
962 | * In the unlikely case current and p have the same deadline | ||
963 | * let us try to decide what's the best thing to do... | ||
964 | */ | ||
965 | if ((p->dl.deadline == rq->curr->dl.deadline) && | ||
966 | !test_tsk_need_resched(rq->curr)) | ||
967 | check_preempt_equal_dl(rq, p); | ||
968 | #endif /* CONFIG_SMP */ | ||
969 | } | ||
970 | |||
971 | #ifdef CONFIG_SCHED_HRTICK | ||
972 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | ||
973 | { | ||
974 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | ||
975 | |||
976 | if (delta > 10000) | ||
977 | hrtick_start(rq, p->dl.runtime); | ||
978 | } | ||
979 | #endif | ||
980 | |||
981 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | ||
982 | struct dl_rq *dl_rq) | ||
983 | { | ||
984 | struct rb_node *left = dl_rq->rb_leftmost; | ||
985 | |||
986 | if (!left) | ||
987 | return NULL; | ||
988 | |||
989 | return rb_entry(left, struct sched_dl_entity, rb_node); | ||
990 | } | ||
991 | |||
992 | struct task_struct *pick_next_task_dl(struct rq *rq) | ||
993 | { | ||
994 | struct sched_dl_entity *dl_se; | ||
995 | struct task_struct *p; | ||
996 | struct dl_rq *dl_rq; | ||
997 | |||
998 | dl_rq = &rq->dl; | ||
999 | |||
1000 | if (unlikely(!dl_rq->dl_nr_running)) | ||
1001 | return NULL; | ||
1002 | |||
1003 | dl_se = pick_next_dl_entity(rq, dl_rq); | ||
1004 | BUG_ON(!dl_se); | ||
1005 | |||
1006 | p = dl_task_of(dl_se); | ||
1007 | p->se.exec_start = rq_clock_task(rq); | ||
1008 | |||
1009 | /* Running task will never be pushed. */ | ||
1010 | dequeue_pushable_dl_task(rq, p); | ||
1011 | |||
1012 | #ifdef CONFIG_SCHED_HRTICK | ||
1013 | if (hrtick_enabled(rq)) | ||
1014 | start_hrtick_dl(rq, p); | ||
1015 | #endif | ||
1016 | |||
1017 | #ifdef CONFIG_SMP | ||
1018 | rq->post_schedule = has_pushable_dl_tasks(rq); | ||
1019 | #endif /* CONFIG_SMP */ | ||
1020 | |||
1021 | return p; | ||
1022 | } | ||
1023 | |||
1024 | static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | ||
1025 | { | ||
1026 | update_curr_dl(rq); | ||
1027 | |||
1028 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) | ||
1029 | enqueue_pushable_dl_task(rq, p); | ||
1030 | } | ||
1031 | |||
1032 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | ||
1033 | { | ||
1034 | update_curr_dl(rq); | ||
1035 | |||
1036 | #ifdef CONFIG_SCHED_HRTICK | ||
1037 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | ||
1038 | start_hrtick_dl(rq, p); | ||
1039 | #endif | ||
1040 | } | ||
1041 | |||
1042 | static void task_fork_dl(struct task_struct *p) | ||
1043 | { | ||
1044 | /* | ||
1045 | * SCHED_DEADLINE tasks cannot fork and this is achieved through | ||
1046 | * sched_fork() | ||
1047 | */ | ||
1048 | } | ||
1049 | |||
1050 | static void task_dead_dl(struct task_struct *p) | ||
1051 | { | ||
1052 | struct hrtimer *timer = &p->dl.dl_timer; | ||
1053 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1054 | |||
1055 | /* | ||
1056 | * Since we are TASK_DEAD we won't slip out of the domain! | ||
1057 | */ | ||
1058 | raw_spin_lock_irq(&dl_b->lock); | ||
1059 | dl_b->total_bw -= p->dl.dl_bw; | ||
1060 | raw_spin_unlock_irq(&dl_b->lock); | ||
1061 | |||
1062 | hrtimer_cancel(timer); | ||
1063 | } | ||
1064 | |||
1065 | static void set_curr_task_dl(struct rq *rq) | ||
1066 | { | ||
1067 | struct task_struct *p = rq->curr; | ||
1068 | |||
1069 | p->se.exec_start = rq_clock_task(rq); | ||
1070 | |||
1071 | /* You can't push away the running task */ | ||
1072 | dequeue_pushable_dl_task(rq, p); | ||
1073 | } | ||
1074 | |||
1075 | #ifdef CONFIG_SMP | ||
1076 | |||
1077 | /* Only try algorithms three times */ | ||
1078 | #define DL_MAX_TRIES 3 | ||
1079 | |||
1080 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | ||
1081 | { | ||
1082 | if (!task_running(rq, p) && | ||
1083 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | ||
1084 | (p->nr_cpus_allowed > 1)) | ||
1085 | return 1; | ||
1086 | |||
1087 | return 0; | ||
1088 | } | ||
1089 | |||
1090 | /* Returns the second earliest -deadline task, NULL otherwise */ | ||
1091 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) | ||
1092 | { | ||
1093 | struct rb_node *next_node = rq->dl.rb_leftmost; | ||
1094 | struct sched_dl_entity *dl_se; | ||
1095 | struct task_struct *p = NULL; | ||
1096 | |||
1097 | next_node: | ||
1098 | next_node = rb_next(next_node); | ||
1099 | if (next_node) { | ||
1100 | dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); | ||
1101 | p = dl_task_of(dl_se); | ||
1102 | |||
1103 | if (pick_dl_task(rq, p, cpu)) | ||
1104 | return p; | ||
1105 | |||
1106 | goto next_node; | ||
1107 | } | ||
1108 | |||
1109 | return NULL; | ||
1110 | } | ||
1111 | |||
1112 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); | ||
1113 | |||
1114 | static int find_later_rq(struct task_struct *task) | ||
1115 | { | ||
1116 | struct sched_domain *sd; | ||
1117 | struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); | ||
1118 | int this_cpu = smp_processor_id(); | ||
1119 | int best_cpu, cpu = task_cpu(task); | ||
1120 | |||
1121 | /* Make sure the mask is initialized first */ | ||
1122 | if (unlikely(!later_mask)) | ||
1123 | return -1; | ||
1124 | |||
1125 | if (task->nr_cpus_allowed == 1) | ||
1126 | return -1; | ||
1127 | |||
1128 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | ||
1129 | task, later_mask); | ||
1130 | if (best_cpu == -1) | ||
1131 | return -1; | ||
1132 | |||
1133 | /* | ||
1134 | * If we are here, some target has been found, | ||
1135 | * the most suitable of which is cached in best_cpu. | ||
1136 | * This is, among the runqueues where the current tasks | ||
1137 | * have later deadlines than the task's one, the rq | ||
1138 | * with the latest possible one. | ||
1139 | * | ||
1140 | * Now we check how well this matches with task's | ||
1141 | * affinity and system topology. | ||
1142 | * | ||
1143 | * The last cpu where the task ran is our first | ||
1144 | * guess, since it is most likely cache-hot there. | ||
1145 | */ | ||
1146 | if (cpumask_test_cpu(cpu, later_mask)) | ||
1147 | return cpu; | ||
1148 | /* | ||
1149 | * Check if this_cpu is to be skipped (i.e., it is | ||
1150 | * not in the mask) or not. | ||
1151 | */ | ||
1152 | if (!cpumask_test_cpu(this_cpu, later_mask)) | ||
1153 | this_cpu = -1; | ||
1154 | |||
1155 | rcu_read_lock(); | ||
1156 | for_each_domain(cpu, sd) { | ||
1157 | if (sd->flags & SD_WAKE_AFFINE) { | ||
1158 | |||
1159 | /* | ||
1160 | * If possible, preempting this_cpu is | ||
1161 | * cheaper than migrating. | ||
1162 | */ | ||
1163 | if (this_cpu != -1 && | ||
1164 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { | ||
1165 | rcu_read_unlock(); | ||
1166 | return this_cpu; | ||
1167 | } | ||
1168 | |||
1169 | /* | ||
1170 | * Last chance: if best_cpu is valid and is | ||
1171 | * in the mask, that becomes our choice. | ||
1172 | */ | ||
1173 | if (best_cpu < nr_cpu_ids && | ||
1174 | cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { | ||
1175 | rcu_read_unlock(); | ||
1176 | return best_cpu; | ||
1177 | } | ||
1178 | } | ||
1179 | } | ||
1180 | rcu_read_unlock(); | ||
1181 | |||
1182 | /* | ||
1183 | * At this point, all our guesses failed, we just return | ||
1184 | * 'something', and let the caller sort things out. | ||
1185 | */ | ||
1186 | if (this_cpu != -1) | ||
1187 | return this_cpu; | ||
1188 | |||
1189 | cpu = cpumask_any(later_mask); | ||
1190 | if (cpu < nr_cpu_ids) | ||
1191 | return cpu; | ||
1192 | |||
1193 | return -1; | ||
1194 | } | ||
1195 | |||
1196 | /* Locks the rq it finds */ | ||
1197 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | ||
1198 | { | ||
1199 | struct rq *later_rq = NULL; | ||
1200 | int tries; | ||
1201 | int cpu; | ||
1202 | |||
1203 | for (tries = 0; tries < DL_MAX_TRIES; tries++) { | ||
1204 | cpu = find_later_rq(task); | ||
1205 | |||
1206 | if ((cpu == -1) || (cpu == rq->cpu)) | ||
1207 | break; | ||
1208 | |||
1209 | later_rq = cpu_rq(cpu); | ||
1210 | |||
1211 | /* Retry if something changed. */ | ||
1212 | if (double_lock_balance(rq, later_rq)) { | ||
1213 | if (unlikely(task_rq(task) != rq || | ||
1214 | !cpumask_test_cpu(later_rq->cpu, | ||
1215 | &task->cpus_allowed) || | ||
1216 | task_running(rq, task) || !task->on_rq)) { | ||
1217 | double_unlock_balance(rq, later_rq); | ||
1218 | later_rq = NULL; | ||
1219 | break; | ||
1220 | } | ||
1221 | } | ||
1222 | |||
1223 | /* | ||
1224 | * If the rq we found has no -deadline task, or | ||
1225 | * its earliest one has a later deadline than our | ||
1226 | * task, the rq is a good one. | ||
1227 | */ | ||
1228 | if (!later_rq->dl.dl_nr_running || | ||
1229 | dl_time_before(task->dl.deadline, | ||
1230 | later_rq->dl.earliest_dl.curr)) | ||
1231 | break; | ||
1232 | |||
1233 | /* Otherwise we try again. */ | ||
1234 | double_unlock_balance(rq, later_rq); | ||
1235 | later_rq = NULL; | ||
1236 | } | ||
1237 | |||
1238 | return later_rq; | ||
1239 | } | ||
1240 | |||
1241 | static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | ||
1242 | { | ||
1243 | struct task_struct *p; | ||
1244 | |||
1245 | if (!has_pushable_dl_tasks(rq)) | ||
1246 | return NULL; | ||
1247 | |||
1248 | p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, | ||
1249 | struct task_struct, pushable_dl_tasks); | ||
1250 | |||
1251 | BUG_ON(rq->cpu != task_cpu(p)); | ||
1252 | BUG_ON(task_current(rq, p)); | ||
1253 | BUG_ON(p->nr_cpus_allowed <= 1); | ||
1254 | |||
1255 | BUG_ON(!p->on_rq); | ||
1256 | BUG_ON(!dl_task(p)); | ||
1257 | |||
1258 | return p; | ||
1259 | } | ||
1260 | |||
1261 | /* | ||
1262 | * See if the non running -deadline tasks on this rq | ||
1263 | * can be sent to some other CPU where they can preempt | ||
1264 | * and start executing. | ||
1265 | */ | ||
1266 | static int push_dl_task(struct rq *rq) | ||
1267 | { | ||
1268 | struct task_struct *next_task; | ||
1269 | struct rq *later_rq; | ||
1270 | |||
1271 | if (!rq->dl.overloaded) | ||
1272 | return 0; | ||
1273 | |||
1274 | next_task = pick_next_pushable_dl_task(rq); | ||
1275 | if (!next_task) | ||
1276 | return 0; | ||
1277 | |||
1278 | retry: | ||
1279 | if (unlikely(next_task == rq->curr)) { | ||
1280 | WARN_ON(1); | ||
1281 | return 0; | ||
1282 | } | ||
1283 | |||
1284 | /* | ||
1285 | * If next_task preempts rq->curr, and rq->curr | ||
1286 | * can move away, it makes sense to just reschedule | ||
1287 | * without going further in pushing next_task. | ||
1288 | */ | ||
1289 | if (dl_task(rq->curr) && | ||
1290 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && | ||
1291 | rq->curr->nr_cpus_allowed > 1) { | ||
1292 | resched_task(rq->curr); | ||
1293 | return 0; | ||
1294 | } | ||
1295 | |||
1296 | /* We might release rq lock */ | ||
1297 | get_task_struct(next_task); | ||
1298 | |||
1299 | /* Will lock the rq it'll find */ | ||
1300 | later_rq = find_lock_later_rq(next_task, rq); | ||
1301 | if (!later_rq) { | ||
1302 | struct task_struct *task; | ||
1303 | |||
1304 | /* | ||
1305 | * We must check all this again, since | ||
1306 | * find_lock_later_rq releases rq->lock and it is | ||
1307 | * then possible that next_task has migrated. | ||
1308 | */ | ||
1309 | task = pick_next_pushable_dl_task(rq); | ||
1310 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | ||
1311 | /* | ||
1312 | * The task is still there. We don't try | ||
1313 | * again, some other cpu will pull it when ready. | ||
1314 | */ | ||
1315 | dequeue_pushable_dl_task(rq, next_task); | ||
1316 | goto out; | ||
1317 | } | ||
1318 | |||
1319 | if (!task) | ||
1320 | /* No more tasks */ | ||
1321 | goto out; | ||
1322 | |||
1323 | put_task_struct(next_task); | ||
1324 | next_task = task; | ||
1325 | goto retry; | ||
1326 | } | ||
1327 | |||
1328 | deactivate_task(rq, next_task, 0); | ||
1329 | set_task_cpu(next_task, later_rq->cpu); | ||
1330 | activate_task(later_rq, next_task, 0); | ||
1331 | |||
1332 | resched_task(later_rq->curr); | ||
1333 | |||
1334 | double_unlock_balance(rq, later_rq); | ||
1335 | |||
1336 | out: | ||
1337 | put_task_struct(next_task); | ||
1338 | |||
1339 | return 1; | ||
1340 | } | ||
1341 | |||
1342 | static void push_dl_tasks(struct rq *rq) | ||
1343 | { | ||
1344 | /* Terminates as it moves a -deadline task */ | ||
1345 | while (push_dl_task(rq)) | ||
1346 | ; | ||
1347 | } | ||
1348 | |||
1349 | static int pull_dl_task(struct rq *this_rq) | ||
1350 | { | ||
1351 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
1352 | struct task_struct *p; | ||
1353 | struct rq *src_rq; | ||
1354 | u64 dmin = LONG_MAX; | ||
1355 | |||
1356 | if (likely(!dl_overloaded(this_rq))) | ||
1357 | return 0; | ||
1358 | |||
1359 | /* | ||
1360 | * Match the barrier from dl_set_overloaded; this guarantees that if we | ||
1361 | * see overloaded we must also see the dlo_mask bit. | ||
1362 | */ | ||
1363 | smp_rmb(); | ||
1364 | |||
1365 | for_each_cpu(cpu, this_rq->rd->dlo_mask) { | ||
1366 | if (this_cpu == cpu) | ||
1367 | continue; | ||
1368 | |||
1369 | src_rq = cpu_rq(cpu); | ||
1370 | |||
1371 | /* | ||
1372 | * It looks racy, and it is! However, as in sched_rt.c, | ||
1373 | * we are fine with this. | ||
1374 | */ | ||
1375 | if (this_rq->dl.dl_nr_running && | ||
1376 | dl_time_before(this_rq->dl.earliest_dl.curr, | ||
1377 | src_rq->dl.earliest_dl.next)) | ||
1378 | continue; | ||
1379 | |||
1380 | /* Might drop this_rq->lock */ | ||
1381 | double_lock_balance(this_rq, src_rq); | ||
1382 | |||
1383 | /* | ||
1384 | * If there are no more pullable tasks on the | ||
1385 | * rq, we're done with it. | ||
1386 | */ | ||
1387 | if (src_rq->dl.dl_nr_running <= 1) | ||
1388 | goto skip; | ||
1389 | |||
1390 | p = pick_next_earliest_dl_task(src_rq, this_cpu); | ||
1391 | |||
1392 | /* | ||
1393 | * We found a task to be pulled if: | ||
1394 | * - it preempts our current (if there's one), | ||
1395 | * - it will preempt the last one we pulled (if any). | ||
1396 | */ | ||
1397 | if (p && dl_time_before(p->dl.deadline, dmin) && | ||
1398 | (!this_rq->dl.dl_nr_running || | ||
1399 | dl_time_before(p->dl.deadline, | ||
1400 | this_rq->dl.earliest_dl.curr))) { | ||
1401 | WARN_ON(p == src_rq->curr); | ||
1402 | WARN_ON(!p->on_rq); | ||
1403 | |||
1404 | /* | ||
1405 | * Then we pull iff p actually has an earlier | ||
1406 | * deadline than the current task of its runqueue. | ||
1407 | */ | ||
1408 | if (dl_time_before(p->dl.deadline, | ||
1409 | src_rq->curr->dl.deadline)) | ||
1410 | goto skip; | ||
1411 | |||
1412 | ret = 1; | ||
1413 | |||
1414 | deactivate_task(src_rq, p, 0); | ||
1415 | set_task_cpu(p, this_cpu); | ||
1416 | activate_task(this_rq, p, 0); | ||
1417 | dmin = p->dl.deadline; | ||
1418 | |||
1419 | /* Is there any other task even earlier? */ | ||
1420 | } | ||
1421 | skip: | ||
1422 | double_unlock_balance(this_rq, src_rq); | ||
1423 | } | ||
1424 | |||
1425 | return ret; | ||
1426 | } | ||
1427 | |||
1428 | static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) | ||
1429 | { | ||
1430 | /* Try to pull other tasks here */ | ||
1431 | if (dl_task(prev)) | ||
1432 | pull_dl_task(rq); | ||
1433 | } | ||
1434 | |||
1435 | static void post_schedule_dl(struct rq *rq) | ||
1436 | { | ||
1437 | push_dl_tasks(rq); | ||
1438 | } | ||
1439 | |||
1440 | /* | ||
1441 | * Since the task is not running and a reschedule is not going to happen | ||
1442 | * anytime soon on its runqueue, we try pushing it away now. | ||
1443 | */ | ||
1444 | static void task_woken_dl(struct rq *rq, struct task_struct *p) | ||
1445 | { | ||
1446 | if (!task_running(rq, p) && | ||
1447 | !test_tsk_need_resched(rq->curr) && | ||
1448 | has_pushable_dl_tasks(rq) && | ||
1449 | p->nr_cpus_allowed > 1 && | ||
1450 | dl_task(rq->curr) && | ||
1451 | (rq->curr->nr_cpus_allowed < 2 || | ||
1452 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | ||
1453 | push_dl_tasks(rq); | ||
1454 | } | ||
1455 | } | ||
1456 | |||
1457 | static void set_cpus_allowed_dl(struct task_struct *p, | ||
1458 | const struct cpumask *new_mask) | ||
1459 | { | ||
1460 | struct rq *rq; | ||
1461 | int weight; | ||
1462 | |||
1463 | BUG_ON(!dl_task(p)); | ||
1464 | |||
1465 | /* | ||
1466 | * Update only if the task is actually running (i.e., | ||
1467 | * it is on the rq AND it is not throttled). | ||
1468 | */ | ||
1469 | if (!on_dl_rq(&p->dl)) | ||
1470 | return; | ||
1471 | |||
1472 | weight = cpumask_weight(new_mask); | ||
1473 | |||
1474 | /* | ||
1475 | * Only update if the process changes whether or not | ||
1476 | * it can migrate. | ||
1477 | */ | ||
1478 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | ||
1479 | return; | ||
1480 | |||
1481 | rq = task_rq(p); | ||
1482 | |||
1483 | /* | ||
1484 | * The process used to be able to migrate OR it can now migrate | ||
1485 | */ | ||
1486 | if (weight <= 1) { | ||
1487 | if (!task_current(rq, p)) | ||
1488 | dequeue_pushable_dl_task(rq, p); | ||
1489 | BUG_ON(!rq->dl.dl_nr_migratory); | ||
1490 | rq->dl.dl_nr_migratory--; | ||
1491 | } else { | ||
1492 | if (!task_current(rq, p)) | ||
1493 | enqueue_pushable_dl_task(rq, p); | ||
1494 | rq->dl.dl_nr_migratory++; | ||
1495 | } | ||
1496 | |||
1497 | update_dl_migration(&rq->dl); | ||
1498 | } | ||
1499 | |||
1500 | /* Assumes rq->lock is held */ | ||
1501 | static void rq_online_dl(struct rq *rq) | ||
1502 | { | ||
1503 | if (rq->dl.overloaded) | ||
1504 | dl_set_overload(rq); | ||
1505 | |||
1506 | if (rq->dl.dl_nr_running > 0) | ||
1507 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | ||
1508 | } | ||
1509 | |||
1510 | /* Assumes rq->lock is held */ | ||
1511 | static void rq_offline_dl(struct rq *rq) | ||
1512 | { | ||
1513 | if (rq->dl.overloaded) | ||
1514 | dl_clear_overload(rq); | ||
1515 | |||
1516 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
1517 | } | ||
1518 | |||
1519 | void init_sched_dl_class(void) | ||
1520 | { | ||
1521 | unsigned int i; | ||
1522 | |||
1523 | for_each_possible_cpu(i) | ||
1524 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), | ||
1525 | GFP_KERNEL, cpu_to_node(i)); | ||
1526 | } | ||
1527 | |||
1528 | #endif /* CONFIG_SMP */ | ||
1529 | |||
1530 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | ||
1531 | { | ||
1532 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | ||
1533 | hrtimer_try_to_cancel(&p->dl.dl_timer); | ||
1534 | |||
1535 | #ifdef CONFIG_SMP | ||
1536 | /* | ||
1537 | * Since this might be the only -deadline task on the rq, | ||
1538 | * this is the right place to try to pull some other one | ||
1539 | * from an overloaded cpu, if any. | ||
1540 | */ | ||
1541 | if (!rq->dl.dl_nr_running) | ||
1542 | pull_dl_task(rq); | ||
1543 | #endif | ||
1544 | } | ||
1545 | |||
1546 | /* | ||
1547 | * When switching to -deadline, we may overload the rq, then | ||
1548 | * we try to push someone off, if possible. | ||
1549 | */ | ||
1550 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | ||
1551 | { | ||
1552 | int check_resched = 1; | ||
1553 | |||
1554 | /* | ||
1555 | * If p is throttled, don't consider the possibility | ||
1556 | * of preempting rq->curr; the check will be done right | ||
1557 | * after its runtime gets replenished. | ||
1558 | */ | ||
1559 | if (unlikely(p->dl.dl_throttled)) | ||
1560 | return; | ||
1561 | |||
1562 | if (p->on_rq || rq->curr != p) { | ||
1563 | #ifdef CONFIG_SMP | ||
1564 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | ||
1565 | /* Only reschedule if pushing failed */ | ||
1566 | check_resched = 0; | ||
1567 | #endif /* CONFIG_SMP */ | ||
1568 | if (check_resched && task_has_dl_policy(rq->curr)) | ||
1569 | check_preempt_curr_dl(rq, p, 0); | ||
1570 | } | ||
1571 | } | ||
1572 | |||
1573 | /* | ||
1574 | * If the scheduling parameters of a -deadline task changed, | ||
1575 | * a push or pull operation might be needed. | ||
1576 | */ | ||
1577 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | ||
1578 | int oldprio) | ||
1579 | { | ||
1580 | if (p->on_rq || rq->curr == p) { | ||
1581 | #ifdef CONFIG_SMP | ||
1582 | /* | ||
1583 | * This might be too much, but unfortunately | ||
1584 | * we don't have the old deadline value, and | ||
1585 | * we can't tell whether the task is raising | ||
1586 | * or lowering its prio, so... | ||
1587 | */ | ||
1588 | if (!rq->dl.overloaded) | ||
1589 | pull_dl_task(rq); | ||
1590 | |||
1591 | /* | ||
1592 | * If we now have an earlier deadline task than p, | ||
1593 | * then reschedule, provided p is still on this | ||
1594 | * runqueue. | ||
1595 | */ | ||
1596 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && | ||
1597 | rq->curr == p) | ||
1598 | resched_task(p); | ||
1599 | #else | ||
1600 | /* | ||
1601 | * Again, we don't know if p has an earlier | ||
1602 | * or later deadline, so let's blindly set a | ||
1603 | * (maybe not needed) rescheduling point. | ||
1604 | */ | ||
1605 | resched_task(p); | ||
1606 | #endif /* CONFIG_SMP */ | ||
1607 | } else | ||
1608 | switched_to_dl(rq, p); | ||
1609 | } | ||
1610 | |||
1611 | const struct sched_class dl_sched_class = { | ||
1612 | .next = &rt_sched_class, | ||
1613 | .enqueue_task = enqueue_task_dl, | ||
1614 | .dequeue_task = dequeue_task_dl, | ||
1615 | .yield_task = yield_task_dl, | ||
1616 | |||
1617 | .check_preempt_curr = check_preempt_curr_dl, | ||
1618 | |||
1619 | .pick_next_task = pick_next_task_dl, | ||
1620 | .put_prev_task = put_prev_task_dl, | ||
1621 | |||
1622 | #ifdef CONFIG_SMP | ||
1623 | .select_task_rq = select_task_rq_dl, | ||
1624 | .set_cpus_allowed = set_cpus_allowed_dl, | ||
1625 | .rq_online = rq_online_dl, | ||
1626 | .rq_offline = rq_offline_dl, | ||
1627 | .pre_schedule = pre_schedule_dl, | ||
1628 | .post_schedule = post_schedule_dl, | ||
1629 | .task_woken = task_woken_dl, | ||
1630 | #endif | ||
1631 | |||
1632 | .set_curr_task = set_curr_task_dl, | ||
1633 | .task_tick = task_tick_dl, | ||
1634 | .task_fork = task_fork_dl, | ||
1635 | .task_dead = task_dead_dl, | ||
1636 | |||
1637 | .prio_changed = prio_changed_dl, | ||
1638 | .switched_from = switched_from_dl, | ||
1639 | .switched_to = switched_to_dl, | ||
1640 | }; | ||
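The .next pointer slots the new class between the stop class and the RT class, so the pick order becomes stop -> deadline -> rt -> fair -> idle (the kernel/sched/stop_task.c hunk further down updates stop_sched_class.next to match). A toy model of how core code consumes that chain, with pick_next a hypothetical stand-in for each class's pick_next_task callback:

#include <stddef.h>

struct task;
struct rq;

/* Simplified model of struct sched_class: only the chain and a pick hook. */
struct sched_class_model {
	const struct sched_class_model *next;
	struct task *(*pick_next)(struct rq *rq);
};

/* Walk the classes in priority order; the first one with a runnable task
 * wins. The real idle class always returns a task, so the walk terminates. */
static struct task *pick_highest(const struct sched_class_model *highest,
				 struct rq *rq)
{
	const struct sched_class_model *class;

	for (class = highest; class; class = class->next) {
		struct task *p = class->pick_next(rq);

		if (p)
			return p;
	}
	return NULL;
}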
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
140 | #endif | 140 | #endif |
141 | #ifdef CONFIG_NUMA_BALANCING | 141 | #ifdef CONFIG_NUMA_BALANCING |
142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | 142 | SEQ_printf(m, " %d", task_node(p)); |
143 | #endif | 143 | #endif |
144 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) | |||
371 | PN(cpu_clk); | 371 | PN(cpu_clk); |
372 | P(jiffies); | 372 | P(jiffies); |
373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
374 | P(sched_clock_stable); | 374 | P(sched_clock_stable()); |
375 | #endif | 375 | #endif |
376 | #undef PN | 376 | #undef PN |
377 | #undef P | 377 | #undef P |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e64b0794060e..b24b6cfde9aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p) | |||
872 | return max(smin, smax); | 872 | return max(smin, smax); |
873 | } | 873 | } |
874 | 874 | ||
875 | /* | ||
876 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
877 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
878 | * scans. This will give the process the chance to accumulate more faults on | ||
879 | * the preferred node but still allow the scheduler to move the task again if | ||
880 | * the nodes CPUs are overloaded. | ||
881 | */ | ||
882 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
883 | |||
884 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | 875 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) |
885 | { | 876 | { |
886 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | 877 | rq->nr_numa_running += (p->numa_preferred_nid != -1); |
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
930 | if (!p->numa_group) | 921 | if (!p->numa_group) |
931 | return 0; | 922 | return 0; |
932 | 923 | ||
933 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | 924 | return p->numa_group->faults[task_faults_idx(nid, 0)] + |
925 | p->numa_group->faults[task_faults_idx(nid, 1)]; | ||
934 | } | 926 | } |
935 | 927 | ||
936 | /* | 928 | /* |
@@ -1023,7 +1015,7 @@ struct task_numa_env { | |||
1023 | 1015 | ||
1024 | struct numa_stats src_stats, dst_stats; | 1016 | struct numa_stats src_stats, dst_stats; |
1025 | 1017 | ||
1026 | int imbalance_pct, idx; | 1018 | int imbalance_pct; |
1027 | 1019 | ||
1028 | struct task_struct *best_task; | 1020 | struct task_struct *best_task; |
1029 | long best_imp; | 1021 | long best_imp; |
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1211 | * elsewhere, so there is no point in (re)trying. | 1203 | * elsewhere, so there is no point in (re)trying. |
1212 | */ | 1204 | */ |
1213 | if (unlikely(!sd)) { | 1205 | if (unlikely(!sd)) { |
1214 | p->numa_preferred_nid = cpu_to_node(task_cpu(p)); | 1206 | p->numa_preferred_nid = task_node(p); |
1215 | return -EINVAL; | 1207 | return -EINVAL; |
1216 | } | 1208 | } |
1217 | 1209 | ||
@@ -1278,7 +1270,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1278 | p->numa_migrate_retry = jiffies + HZ; | 1270 | p->numa_migrate_retry = jiffies + HZ; |
1279 | 1271 | ||
1280 | /* Success if task is already running on preferred CPU */ | 1272 | /* Success if task is already running on preferred CPU */ |
1281 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | 1273 | if (task_node(p) == p->numa_preferred_nid) |
1282 | return; | 1274 | return; |
1283 | 1275 | ||
1284 | /* Otherwise, try migrate to a CPU on the preferred node */ | 1276 | /* Otherwise, try migrate to a CPU on the preferred node */ |
@@ -1350,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p, | |||
1350 | * scanning faster if shared accesses dominate as it may | 1342 | * scanning faster if shared accesses dominate as it may |
1351 | * simply bounce migrations uselessly | 1343 | * simply bounce migrations uselessly |
1352 | */ | 1344 | */ |
1353 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
1354 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1345 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); |
1355 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1346 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
1356 | } | 1347 | } |
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4101 | */ | 4092 | */ |
4102 | static struct sched_group * | 4093 | static struct sched_group * |
4103 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 4094 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
4104 | int this_cpu, int load_idx) | 4095 | int this_cpu, int sd_flag) |
4105 | { | 4096 | { |
4106 | struct sched_group *idlest = NULL, *group = sd->groups; | 4097 | struct sched_group *idlest = NULL, *group = sd->groups; |
4107 | unsigned long min_load = ULONG_MAX, this_load = 0; | 4098 | unsigned long min_load = ULONG_MAX, this_load = 0; |
4099 | int load_idx = sd->forkexec_idx; | ||
4108 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 4100 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
4109 | 4101 | ||
4102 | if (sd_flag & SD_BALANCE_WAKE) | ||
4103 | load_idx = sd->wake_idx; | ||
4104 | |||
4110 | do { | 4105 | do { |
4111 | unsigned long load, avg_load; | 4106 | unsigned long load, avg_load; |
4112 | int local_group; | 4107 | int local_group; |
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4274 | } | 4269 | } |
4275 | 4270 | ||
4276 | while (sd) { | 4271 | while (sd) { |
4277 | int load_idx = sd->forkexec_idx; | ||
4278 | struct sched_group *group; | 4272 | struct sched_group *group; |
4279 | int weight; | 4273 | int weight; |
4280 | 4274 | ||
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4283 | continue; | 4277 | continue; |
4284 | } | 4278 | } |
4285 | 4279 | ||
4286 | if (sd_flag & SD_BALANCE_WAKE) | 4280 | group = find_idlest_group(sd, p, cpu, sd_flag); |
4287 | load_idx = sd->wake_idx; | ||
4288 | |||
4289 | group = find_idlest_group(sd, p, cpu, load_idx); | ||
4290 | if (!group) { | 4281 | if (!group) { |
4291 | sd = sd->child; | 4282 | sd = sd->child; |
4292 | continue; | 4283 | continue; |
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5512 | struct sched_group *group, int load_idx, | 5503 | struct sched_group *group, int load_idx, |
5513 | int local_group, struct sg_lb_stats *sgs) | 5504 | int local_group, struct sg_lb_stats *sgs) |
5514 | { | 5505 | { |
5515 | unsigned long nr_running; | ||
5516 | unsigned long load; | 5506 | unsigned long load; |
5517 | int i; | 5507 | int i; |
5518 | 5508 | ||
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5521 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5511 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
5522 | struct rq *rq = cpu_rq(i); | 5512 | struct rq *rq = cpu_rq(i); |
5523 | 5513 | ||
5524 | nr_running = rq->nr_running; | ||
5525 | |||
5526 | /* Bias balancing toward cpus of our domain */ | 5514 | /* Bias balancing toward cpus of our domain */ |
5527 | if (local_group) | 5515 | if (local_group) |
5528 | load = target_load(i, load_idx); | 5516 | load = target_load(i, load_idx); |
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5530 | load = source_load(i, load_idx); | 5518 | load = source_load(i, load_idx); |
5531 | 5519 | ||
5532 | sgs->group_load += load; | 5520 | sgs->group_load += load; |
5533 | sgs->sum_nr_running += nr_running; | 5521 | sgs->sum_nr_running += rq->nr_running; |
5534 | #ifdef CONFIG_NUMA_BALANCING | 5522 | #ifdef CONFIG_NUMA_BALANCING |
5535 | sgs->nr_numa_running += rq->nr_numa_running; | 5523 | sgs->nr_numa_running += rq->nr_numa_running; |
5536 | sgs->nr_preferred_running += rq->nr_preferred_running; | 5524 | sgs->nr_preferred_running += rq->nr_preferred_running; |
@@ -6521,7 +6509,7 @@ static struct { | |||
6521 | unsigned long next_balance; /* in jiffy units */ | 6509 | unsigned long next_balance; /* in jiffy units */ |
6522 | } nohz ____cacheline_aligned; | 6510 | } nohz ____cacheline_aligned; |
6523 | 6511 | ||
6524 | static inline int find_new_ilb(int call_cpu) | 6512 | static inline int find_new_ilb(void) |
6525 | { | 6513 | { |
6526 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 6514 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
6527 | 6515 | ||
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu) | |||
6536 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | 6524 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle |
6537 | * CPU (if there is one). | 6525 | * CPU (if there is one). |
6538 | */ | 6526 | */ |
6539 | static void nohz_balancer_kick(int cpu) | 6527 | static void nohz_balancer_kick(void) |
6540 | { | 6528 | { |
6541 | int ilb_cpu; | 6529 | int ilb_cpu; |
6542 | 6530 | ||
6543 | nohz.next_balance++; | 6531 | nohz.next_balance++; |
6544 | 6532 | ||
6545 | ilb_cpu = find_new_ilb(cpu); | 6533 | ilb_cpu = find_new_ilb(); |
6546 | 6534 | ||
6547 | if (ilb_cpu >= nr_cpu_ids) | 6535 | if (ilb_cpu >= nr_cpu_ids) |
6548 | return; | 6536 | return; |
@@ -6652,10 +6640,10 @@ void update_max_interval(void) | |||
6652 | * | 6640 | * |
6653 | * Balancing parameters are set up in init_sched_domains. | 6641 | * Balancing parameters are set up in init_sched_domains. |
6654 | */ | 6642 | */ |
6655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 6643 | static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) |
6656 | { | 6644 | { |
6657 | int continue_balancing = 1; | 6645 | int continue_balancing = 1; |
6658 | struct rq *rq = cpu_rq(cpu); | 6646 | int cpu = rq->cpu; |
6659 | unsigned long interval; | 6647 | unsigned long interval; |
6660 | struct sched_domain *sd; | 6648 | struct sched_domain *sd; |
6661 | /* Earliest time when we have to do rebalance again */ | 6649 | /* Earliest time when we have to do rebalance again */ |
@@ -6752,9 +6740,9 @@ out: | |||
6752 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 6740 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
6753 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 6741 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
6754 | */ | 6742 | */ |
6755 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 6743 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) |
6756 | { | 6744 | { |
6757 | struct rq *this_rq = cpu_rq(this_cpu); | 6745 | int this_cpu = this_rq->cpu; |
6758 | struct rq *rq; | 6746 | struct rq *rq; |
6759 | int balance_cpu; | 6747 | int balance_cpu; |
6760 | 6748 | ||
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
6781 | update_idle_cpu_load(rq); | 6769 | update_idle_cpu_load(rq); |
6782 | raw_spin_unlock_irq(&rq->lock); | 6770 | raw_spin_unlock_irq(&rq->lock); |
6783 | 6771 | ||
6784 | rebalance_domains(balance_cpu, CPU_IDLE); | 6772 | rebalance_domains(rq, CPU_IDLE); |
6785 | 6773 | ||
6786 | if (time_after(this_rq->next_balance, rq->next_balance)) | 6774 | if (time_after(this_rq->next_balance, rq->next_balance)) |
6787 | this_rq->next_balance = rq->next_balance; | 6775 | this_rq->next_balance = rq->next_balance; |
@@ -6800,14 +6788,14 @@ end: | |||
6800 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 6788 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
6801 | * domain span are idle. | 6789 | * domain span are idle. |
6802 | */ | 6790 | */ |
6803 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 6791 | static inline int nohz_kick_needed(struct rq *rq) |
6804 | { | 6792 | { |
6805 | unsigned long now = jiffies; | 6793 | unsigned long now = jiffies; |
6806 | struct sched_domain *sd; | 6794 | struct sched_domain *sd; |
6807 | struct sched_group_power *sgp; | 6795 | struct sched_group_power *sgp; |
6808 | int nr_busy; | 6796 | int nr_busy, cpu = rq->cpu; |
6809 | 6797 | ||
6810 | if (unlikely(idle_cpu(cpu))) | 6798 | if (unlikely(rq->idle_balance)) |
6811 | return 0; | 6799 | return 0; |
6812 | 6800 | ||
6813 | /* | 6801 | /* |
@@ -6856,7 +6844,7 @@ need_kick: | |||
6856 | return 1; | 6844 | return 1; |
6857 | } | 6845 | } |
6858 | #else | 6846 | #else |
6859 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 6847 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
6860 | #endif | 6848 | #endif |
6861 | 6849 | ||
6862 | /* | 6850 | /* |
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | |||
6865 | */ | 6853 | */ |
6866 | static void run_rebalance_domains(struct softirq_action *h) | 6854 | static void run_rebalance_domains(struct softirq_action *h) |
6867 | { | 6855 | { |
6868 | int this_cpu = smp_processor_id(); | 6856 | struct rq *this_rq = this_rq(); |
6869 | struct rq *this_rq = cpu_rq(this_cpu); | ||
6870 | enum cpu_idle_type idle = this_rq->idle_balance ? | 6857 | enum cpu_idle_type idle = this_rq->idle_balance ? |
6871 | CPU_IDLE : CPU_NOT_IDLE; | 6858 | CPU_IDLE : CPU_NOT_IDLE; |
6872 | 6859 | ||
6873 | rebalance_domains(this_cpu, idle); | 6860 | rebalance_domains(this_rq, idle); |
6874 | 6861 | ||
6875 | /* | 6862 | /* |
6876 | * If this cpu has a pending nohz_balance_kick, then do the | 6863 | * If this cpu has a pending nohz_balance_kick, then do the |
6877 | * balancing on behalf of the other idle cpus whose ticks are | 6864 | * balancing on behalf of the other idle cpus whose ticks are |
6878 | * stopped. | 6865 | * stopped. |
6879 | */ | 6866 | */ |
6880 | nohz_idle_balance(this_cpu, idle); | 6867 | nohz_idle_balance(this_rq, idle); |
6881 | } | 6868 | } |
6882 | 6869 | ||
6883 | static inline int on_null_domain(int cpu) | 6870 | static inline int on_null_domain(struct rq *rq) |
6884 | { | 6871 | { |
6885 | return !rcu_dereference_sched(cpu_rq(cpu)->sd); | 6872 | return !rcu_dereference_sched(rq->sd); |
6886 | } | 6873 | } |
6887 | 6874 | ||
6888 | /* | 6875 | /* |
6889 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 6876 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
6890 | */ | 6877 | */ |
6891 | void trigger_load_balance(struct rq *rq, int cpu) | 6878 | void trigger_load_balance(struct rq *rq) |
6892 | { | 6879 | { |
6893 | /* Don't need to rebalance while attached to NULL domain */ | 6880 | /* Don't need to rebalance while attached to NULL domain */ |
6894 | if (time_after_eq(jiffies, rq->next_balance) && | 6881 | if (unlikely(on_null_domain(rq))) |
6895 | likely(!on_null_domain(cpu))) | 6882 | return; |
6883 | |||
6884 | if (time_after_eq(jiffies, rq->next_balance)) | ||
6896 | raise_softirq(SCHED_SOFTIRQ); | 6885 | raise_softirq(SCHED_SOFTIRQ); |
6897 | #ifdef CONFIG_NO_HZ_COMMON | 6886 | #ifdef CONFIG_NO_HZ_COMMON |
6898 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 6887 | if (nohz_kick_needed(rq)) |
6899 | nohz_balancer_kick(cpu); | 6888 | nohz_balancer_kick(); |
6900 | #endif | 6889 | #endif |
6901 | } | 6890 | } |
6902 | 6891 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..a2740b775b45 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1738 | !test_tsk_need_resched(rq->curr) && | 1738 | !test_tsk_need_resched(rq->curr) && |
1739 | has_pushable_tasks(rq) && | 1739 | has_pushable_tasks(rq) && |
1740 | p->nr_cpus_allowed > 1 && | 1740 | p->nr_cpus_allowed > 1 && |
1741 | rt_task(rq->curr) && | 1741 | (dl_task(rq->curr) || rt_task(rq->curr)) && |
1742 | (rq->curr->nr_cpus_allowed < 2 || | 1742 | (rq->curr->nr_cpus_allowed < 2 || |
1743 | rq->curr->prio <= p->prio)) | 1743 | rq->curr->prio <= p->prio)) |
1744 | push_rt_tasks(rq); | 1744 | push_rt_tasks(rq); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..c2119fd20f8b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/sched/deadline.h> | ||
5 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
6 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
@@ -9,6 +10,7 @@ | |||
9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
10 | 11 | ||
11 | #include "cpupri.h" | 12 | #include "cpupri.h" |
13 | #include "cpudeadline.h" | ||
12 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
13 | 15 | ||
14 | struct rq; | 16 | struct rq; |
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
73 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 75 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
74 | 76 | ||
75 | /* | 77 | /* |
78 | * Single value that decides SCHED_DEADLINE internal math precision. | ||
79 | * 10 -> just above 1us | ||
80 | * 9 -> just above 0.5us | ||
81 | */ | ||
82 | #define DL_SCALE (10) | ||
83 | |||
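For reference, the parameters handled by this class are expressed in nanoseconds, so the comment's values follow directly from the shift: 2^10 ns = 1024 ns, just above 1 us, while 2^9 ns = 512 ns is just above 0.5 us.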
84 | /* | ||
76 | * These are the 'tuning knobs' of the scheduler: | 85 | * These are the 'tuning knobs' of the scheduler: |
77 | */ | 86 | */ |
78 | 87 | ||
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
81 | */ | 90 | */ |
82 | #define RUNTIME_INF ((u64)~0ULL) | 91 | #define RUNTIME_INF ((u64)~0ULL) |
83 | 92 | ||
93 | static inline int fair_policy(int policy) | ||
94 | { | ||
95 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | ||
96 | } | ||
97 | |||
84 | static inline int rt_policy(int policy) | 98 | static inline int rt_policy(int policy) |
85 | { | 99 | { |
86 | if (policy == SCHED_FIFO || policy == SCHED_RR) | 100 | return policy == SCHED_FIFO || policy == SCHED_RR; |
87 | return 1; | 101 | } |
88 | return 0; | 102 | |
103 | static inline int dl_policy(int policy) | ||
104 | { | ||
105 | return policy == SCHED_DEADLINE; | ||
89 | } | 106 | } |
90 | 107 | ||
91 | static inline int task_has_rt_policy(struct task_struct *p) | 108 | static inline int task_has_rt_policy(struct task_struct *p) |
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p) | |||
93 | return rt_policy(p->policy); | 110 | return rt_policy(p->policy); |
94 | } | 111 | } |
95 | 112 | ||
113 | static inline int task_has_dl_policy(struct task_struct *p) | ||
114 | { | ||
115 | return dl_policy(p->policy); | ||
116 | } | ||
117 | |||
118 | static inline bool dl_time_before(u64 a, u64 b) | ||
119 | { | ||
120 | return (s64)(a - b) < 0; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Tells if entity @a should preempt entity @b. | ||
125 | */ | ||
126 | static inline bool | ||
127 | dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) | ||
128 | { | ||
129 | return dl_time_before(a->deadline, b->deadline); | ||
130 | } | ||
131 | |||
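dl_time_before() (and dl_entity_preempt() built on it) compares 64-bit deadline clocks through a signed difference, so the result stays correct across u64 wraparound as long as the two values are less than 2^63 ns apart. A self-contained user-space illustration of the same trick (not kernel code):

#include <assert.h>
#include <stdint.h>

/* Same idea as dl_time_before(): the signed difference survives wraparound. */
static inline int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	assert(before(100, 200));		/* ordinary case */
	assert(!before(200, 100));
	/* b has wrapped past zero, yet a still compares as earlier. */
	assert(before(UINT64_MAX - 5, (uint64_t)(UINT64_MAX - 5) + 10));
	return 0;
}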
96 | /* | 132 | /* |
97 | * This is the priority-queue data structure of the RT scheduling class: | 133 | * This is the priority-queue data structure of the RT scheduling class: |
98 | */ | 134 | */ |
@@ -108,6 +144,47 @@ struct rt_bandwidth { | |||
108 | u64 rt_runtime; | 144 | u64 rt_runtime; |
109 | struct hrtimer rt_period_timer; | 145 | struct hrtimer rt_period_timer; |
110 | }; | 146 | }; |
147 | /* | ||
148 | * To keep the bandwidth of -deadline tasks and groups under control | ||
149 | * we need some place to: | ||
150 | * - store the maximum -deadline bandwidth of the system (the group); | ||
151 | * - cache the fraction of that bandwidth that is currently allocated. | ||
152 | * | ||
153 | * This is all done in the data structure below. It is similar to the | ||
154 | * one used for RT-throttling (rt_bandwidth), with the main difference | ||
155 | * that, since here we are only interested in admission control, we | ||
156 | * do not decrease any runtime while the group "executes", neither we | ||
157 | * need a timer to replenish it. | ||
158 | * | ||
159 | * With respect to SMP, the bandwidth is given on a per-CPU basis, | ||
160 | * meaning that: | ||
161 | * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; | ||
162 | * - dl_total_bw array contains, in the i-th element, the currently | ||
163 | * allocated bandwidth on the i-th CPU. | ||
164 | * Moreover, groups consume bandwidth on each CPU, while tasks only | ||
165 | * consume bandwidth on the CPU they're running on. | ||
166 | * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw | ||
167 | * that will be shown the next time the proc or cgroup controls are | ||
168 | * read. It can, in turn, be changed by writing to its own | ||
169 | * control. | ||
170 | */ | ||
171 | struct dl_bandwidth { | ||
172 | raw_spinlock_t dl_runtime_lock; | ||
173 | u64 dl_runtime; | ||
174 | u64 dl_period; | ||
175 | }; | ||
176 | |||
177 | static inline int dl_bandwidth_enabled(void) | ||
178 | { | ||
179 | return sysctl_sched_rt_runtime >= 0; | ||
180 | } | ||
181 | |||
182 | extern struct dl_bw *dl_bw_of(int i); | ||
183 | |||
184 | struct dl_bw { | ||
185 | raw_spinlock_t lock; | ||
186 | u64 bw, total_bw; | ||
187 | }; | ||
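Tying the two structures together: a -deadline task with runtime Q and period P contributes a utilisation of Q/P, and admission control only has to keep the running total (total_bw) below the cap (bw). The sketch below shows that bookkeeping in user-space form; the BW_SHIFT fixed-point scaling and the helper names are assumptions for illustration, not the kernel's actual dl_overflow() code:

#include <stdint.h>

#define BW_SHIFT 20	/* assumed fixed-point shift, illustration only */

/* Bandwidth of one -deadline task: runtime/period in fixed point. */
static uint64_t task_bw(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

/* Admit the task only if the already-allocated bandwidth plus its own
 * stays within the cap, i.e. the dl_bw.bw / dl_bw.total_bw bookkeeping. */
static int can_admit(uint64_t bw_cap, uint64_t total_bw,
		     uint64_t period, uint64_t runtime)
{
	return total_bw + task_bw(period, runtime) <= bw_cap;
}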
111 | 188 | ||
112 | extern struct mutex sched_domains_mutex; | 189 | extern struct mutex sched_domains_mutex; |
113 | 190 | ||
@@ -364,6 +441,42 @@ struct rt_rq { | |||
364 | #endif | 441 | #endif |
365 | }; | 442 | }; |
366 | 443 | ||
444 | /* Deadline class' related fields in a runqueue */ | ||
445 | struct dl_rq { | ||
446 | /* runqueue is an rbtree, ordered by deadline */ | ||
447 | struct rb_root rb_root; | ||
448 | struct rb_node *rb_leftmost; | ||
449 | |||
450 | unsigned long dl_nr_running; | ||
451 | |||
452 | #ifdef CONFIG_SMP | ||
453 | /* | ||
454 | * Deadline values of the currently executing and the | ||
455 | * earliest ready task on this rq. Caching these facilitates | ||
456 | * the decision whether or not a ready but not running task | ||
457 | * should migrate somewhere else. | ||
458 | */ | ||
459 | struct { | ||
460 | u64 curr; | ||
461 | u64 next; | ||
462 | } earliest_dl; | ||
463 | |||
464 | unsigned long dl_nr_migratory; | ||
465 | unsigned long dl_nr_total; | ||
466 | int overloaded; | ||
467 | |||
468 | /* | ||
469 | * Tasks on this rq that can be pushed away. They are kept in | ||
470 | * an rb-tree, ordered by tasks' deadlines, with caching | ||
471 | * of the leftmost (earliest deadline) element. | ||
472 | */ | ||
473 | struct rb_root pushable_dl_tasks_root; | ||
474 | struct rb_node *pushable_dl_tasks_leftmost; | ||
475 | #else | ||
476 | struct dl_bw dl_bw; | ||
477 | #endif | ||
478 | }; | ||
479 | |||
367 | #ifdef CONFIG_SMP | 480 | #ifdef CONFIG_SMP |
368 | 481 | ||
369 | /* | 482 | /* |
@@ -382,6 +495,15 @@ struct root_domain { | |||
382 | cpumask_var_t online; | 495 | cpumask_var_t online; |
383 | 496 | ||
384 | /* | 497 | /* |
498 | * The bit corresponding to a CPU gets set here if such CPU has more | ||
499 | * than one runnable -deadline task (as it is below for RT tasks). | ||
500 | */ | ||
501 | cpumask_var_t dlo_mask; | ||
502 | atomic_t dlo_count; | ||
503 | struct dl_bw dl_bw; | ||
504 | struct cpudl cpudl; | ||
505 | |||
506 | /* | ||
385 | * The "RT overload" flag: it gets set if a CPU has more than | 507 | * The "RT overload" flag: it gets set if a CPU has more than |
386 | * one runnable RT task. | 508 | * one runnable RT task. |
387 | */ | 509 | */ |
@@ -432,6 +554,7 @@ struct rq { | |||
432 | 554 | ||
433 | struct cfs_rq cfs; | 555 | struct cfs_rq cfs; |
434 | struct rt_rq rt; | 556 | struct rt_rq rt; |
557 | struct dl_rq dl; | ||
435 | 558 | ||
436 | #ifdef CONFIG_FAIR_GROUP_SCHED | 559 | #ifdef CONFIG_FAIR_GROUP_SCHED |
437 | /* list of leaf cfs_rq on this cpu: */ | 560 | /* list of leaf cfs_rq on this cpu: */ |
@@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void) | |||
827 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 950 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
828 | } | 951 | } |
829 | 952 | ||
830 | |||
831 | |||
832 | static inline int task_current(struct rq *rq, struct task_struct *p) | 953 | static inline int task_current(struct rq *rq, struct task_struct *p) |
833 | { | 954 | { |
834 | return rq->curr == p; | 955 | return rq->curr == p; |
@@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = { | |||
988 | #else | 1109 | #else |
989 | #define ENQUEUE_WAKING 0 | 1110 | #define ENQUEUE_WAKING 0 |
990 | #endif | 1111 | #endif |
1112 | #define ENQUEUE_REPLENISH 8 | ||
991 | 1113 | ||
992 | #define DEQUEUE_SLEEP 1 | 1114 | #define DEQUEUE_SLEEP 1 |
993 | 1115 | ||
@@ -1023,6 +1145,7 @@ struct sched_class { | |||
1023 | void (*set_curr_task) (struct rq *rq); | 1145 | void (*set_curr_task) (struct rq *rq); |
1024 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1146 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
1025 | void (*task_fork) (struct task_struct *p); | 1147 | void (*task_fork) (struct task_struct *p); |
1148 | void (*task_dead) (struct task_struct *p); | ||
1026 | 1149 | ||
1027 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1150 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
1028 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1151 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
@@ -1042,6 +1165,7 @@ struct sched_class { | |||
1042 | for (class = sched_class_highest; class; class = class->next) | 1165 | for (class = sched_class_highest; class; class = class->next) |
1043 | 1166 | ||
1044 | extern const struct sched_class stop_sched_class; | 1167 | extern const struct sched_class stop_sched_class; |
1168 | extern const struct sched_class dl_sched_class; | ||
1045 | extern const struct sched_class rt_sched_class; | 1169 | extern const struct sched_class rt_sched_class; |
1046 | extern const struct sched_class fair_sched_class; | 1170 | extern const struct sched_class fair_sched_class; |
1047 | extern const struct sched_class idle_sched_class; | 1171 | extern const struct sched_class idle_sched_class; |
@@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class; | |||
1051 | 1175 | ||
1052 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1176 | extern void update_group_power(struct sched_domain *sd, int cpu); |
1053 | 1177 | ||
1054 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1178 | extern void trigger_load_balance(struct rq *rq); |
1055 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1179 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1056 | 1180 | ||
1057 | extern void idle_enter_fair(struct rq *this_rq); | 1181 | extern void idle_enter_fair(struct rq *this_rq); |
@@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
1068 | extern void sysrq_sched_debug_show(void); | 1192 | extern void sysrq_sched_debug_show(void); |
1069 | extern void sched_init_granularity(void); | 1193 | extern void sched_init_granularity(void); |
1070 | extern void update_max_interval(void); | 1194 | extern void update_max_interval(void); |
1195 | |||
1196 | extern void init_sched_dl_class(void); | ||
1071 | extern void init_sched_rt_class(void); | 1197 | extern void init_sched_rt_class(void); |
1072 | extern void init_sched_fair_class(void); | 1198 | extern void init_sched_fair_class(void); |
1199 | extern void init_sched_dl_class(void); | ||
1073 | 1200 | ||
1074 | extern void resched_task(struct task_struct *p); | 1201 | extern void resched_task(struct task_struct *p); |
1075 | extern void resched_cpu(int cpu); | 1202 | extern void resched_cpu(int cpu); |
@@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu); | |||
1077 | extern struct rt_bandwidth def_rt_bandwidth; | 1204 | extern struct rt_bandwidth def_rt_bandwidth; |
1078 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 1205 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
1079 | 1206 | ||
1207 | extern struct dl_bandwidth def_dl_bandwidth; | ||
1208 | extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); | ||
1209 | extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | ||
1210 | |||
1211 | unsigned long to_ratio(u64 period, u64 runtime); | ||
1212 | |||
1080 | extern void update_idle_cpu_load(struct rq *this_rq); | 1213 | extern void update_idle_cpu_load(struct rq *this_rq); |
1081 | 1214 | ||
1082 | extern void init_task_runnable_average(struct task_struct *p); | 1215 | extern void init_task_runnable_average(struct task_struct *p); |
@@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1353 | 1486 | ||
1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1487 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1488 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
1489 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | ||
1356 | 1490 | ||
1357 | extern void cfs_bandwidth_usage_inc(void); | 1491 | extern void cfs_bandwidth_usage_inc(void); |
1358 | extern void cfs_bandwidth_usage_dec(void); | 1492 | extern void cfs_bandwidth_usage_dec(void); |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
103 | * Simple, special scheduling class for the per-CPU stop tasks: | 103 | * Simple, special scheduling class for the per-CPU stop tasks: |
104 | */ | 104 | */ |
105 | const struct sched_class stop_sched_class = { | 105 | const struct sched_class stop_sched_class = { |
106 | .next = &rt_sched_class, | 106 | .next = &dl_sched_class, |
107 | 107 | ||
108 | .enqueue_task = enqueue_task_stop, | 108 | .enqueue_task = enqueue_task_stop, |
109 | .dequeue_task = dequeue_task_stop, | 109 | .dequeue_task = dequeue_task_stop, |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 9a4500e4c189..8b93b3770f85 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -89,7 +89,7 @@ static void wakeup_softirqd(void) | |||
89 | * where hardirqs are disabled legitimately: | 89 | * where hardirqs are disabled legitimately: |
90 | */ | 90 | */ |
91 | #ifdef CONFIG_TRACE_IRQFLAGS | 91 | #ifdef CONFIG_TRACE_IRQFLAGS |
92 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) | 92 | void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) |
93 | { | 93 | { |
94 | unsigned long flags; | 94 | unsigned long flags; |
95 | 95 | ||
@@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
107 | /* | 107 | /* |
108 | * Were softirqs turned off above: | 108 | * Were softirqs turned off above: |
109 | */ | 109 | */ |
110 | if (softirq_count() == cnt) | 110 | if (softirq_count() == (cnt & SOFTIRQ_MASK)) |
111 | trace_softirqs_off(ip); | 111 | trace_softirqs_off(ip); |
112 | raw_local_irq_restore(flags); | 112 | raw_local_irq_restore(flags); |
113 | 113 | ||
114 | if (preempt_count() == cnt) | 114 | if (preempt_count() == cnt) |
115 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 115 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
116 | } | 116 | } |
117 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 117 | EXPORT_SYMBOL(__local_bh_disable_ip); |
118 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | ||
119 | { | ||
120 | preempt_count_add(cnt); | ||
121 | barrier(); | ||
122 | } | ||
123 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 118 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
124 | 119 | ||
125 | void local_bh_disable(void) | ||
126 | { | ||
127 | __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); | ||
128 | } | ||
129 | |||
130 | EXPORT_SYMBOL(local_bh_disable); | ||
131 | |||
132 | static void __local_bh_enable(unsigned int cnt) | 120 | static void __local_bh_enable(unsigned int cnt) |
133 | { | 121 | { |
134 | WARN_ON_ONCE(!irqs_disabled()); | 122 | WARN_ON_ONCE(!irqs_disabled()); |
135 | 123 | ||
136 | if (softirq_count() == cnt) | 124 | if (softirq_count() == (cnt & SOFTIRQ_MASK)) |
137 | trace_softirqs_on(_RET_IP_); | 125 | trace_softirqs_on(_RET_IP_); |
138 | preempt_count_sub(cnt); | 126 | preempt_count_sub(cnt); |
139 | } | 127 | } |
@@ -151,7 +139,7 @@ void _local_bh_enable(void) | |||
151 | 139 | ||
152 | EXPORT_SYMBOL(_local_bh_enable); | 140 | EXPORT_SYMBOL(_local_bh_enable); |
153 | 141 | ||
154 | static inline void _local_bh_enable_ip(unsigned long ip) | 142 | void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) |
155 | { | 143 | { |
156 | WARN_ON_ONCE(in_irq() || irqs_disabled()); | 144 | WARN_ON_ONCE(in_irq() || irqs_disabled()); |
157 | #ifdef CONFIG_TRACE_IRQFLAGS | 145 | #ifdef CONFIG_TRACE_IRQFLAGS |
@@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
166 | * Keep preemption disabled until we are done with | 154 | * Keep preemption disabled until we are done with |
167 | * softirq processing: | 155 | * softirq processing: |
168 | */ | 156 | */ |
169 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); | 157 | preempt_count_sub(cnt - 1); |
170 | 158 | ||
171 | if (unlikely(!in_interrupt() && local_softirq_pending())) { | 159 | if (unlikely(!in_interrupt() && local_softirq_pending())) { |
172 | /* | 160 | /* |
@@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
182 | #endif | 170 | #endif |
183 | preempt_check_resched(); | 171 | preempt_check_resched(); |
184 | } | 172 | } |
185 | 173 | EXPORT_SYMBOL(__local_bh_enable_ip); | |
186 | void local_bh_enable(void) | ||
187 | { | ||
188 | _local_bh_enable_ip(_RET_IP_); | ||
189 | } | ||
190 | EXPORT_SYMBOL(local_bh_enable); | ||
191 | |||
192 | void local_bh_enable_ip(unsigned long ip) | ||
193 | { | ||
194 | _local_bh_enable_ip(ip); | ||
195 | } | ||
196 | EXPORT_SYMBOL(local_bh_enable_ip); | ||
197 | 174 | ||
198 | /* | 175 | /* |
199 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, | 176 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, |
@@ -264,7 +241,7 @@ asmlinkage void __do_softirq(void) | |||
264 | pending = local_softirq_pending(); | 241 | pending = local_softirq_pending(); |
265 | account_irq_enter_time(current); | 242 | account_irq_enter_time(current); |
266 | 243 | ||
267 | __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); | 244 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); |
268 | in_hardirq = lockdep_softirq_start(); | 245 | in_hardirq = lockdep_softirq_start(); |
269 | 246 | ||
270 | cpu = smp_processor_id(); | 247 | cpu = smp_processor_id(); |
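With the local_bh_disable()/local_bh_enable() definitions removed from softirq.c, callers now reach the two exported *_ip() functions through header-level inlines. That header is not part of this hunk; its shape is roughly the following (an assumption based on the symbols exported above, not a quote of the patch):

/* Approximate shape of the reworked include/linux/bottom_half.h wrappers. */
static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable_ip(unsigned long ip)
{
	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}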
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a604726d0b..c8da99f905cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -385,13 +385,6 @@ static struct ctl_table kern_table[] = { | |||
385 | .proc_handler = proc_dointvec, | 385 | .proc_handler = proc_dointvec, |
386 | }, | 386 | }, |
387 | { | 387 | { |
388 | .procname = "numa_balancing_settle_count", | ||
389 | .data = &sysctl_numa_balancing_settle_count, | ||
390 | .maxlen = sizeof(unsigned int), | ||
391 | .mode = 0644, | ||
392 | .proc_handler = proc_dointvec, | ||
393 | }, | ||
394 | { | ||
395 | .procname = "numa_balancing_migrate_deferred", | 388 | .procname = "numa_balancing_migrate_deferred", |
396 | .data = &sysctl_numa_balancing_migrate_deferred, | 389 | .data = &sysctl_numa_balancing_migrate_deferred, |
397 | .maxlen = sizeof(unsigned int), | 390 | .maxlen = sizeof(unsigned int), |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ea20f7d1ac2c..c833249ab0fb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -177,7 +177,7 @@ static bool can_stop_full_tick(void) | |||
177 | * TODO: kick full dynticks CPUs when | 177 | * TODO: kick full dynticks CPUs when |
178 | * sched_clock_stable is set. | 178 | * sched_clock_stable is set. |
179 | */ | 179 | */ |
180 | if (!sched_clock_stable) { | 180 | if (!sched_clock_stable()) { |
181 | trace_tick_stop(0, "unstable sched clock\n"); | 181 | trace_tick_stop(0, "unstable sched clock\n"); |
182 | /* | 182 | /* |
183 | * Don't allow the user to think they can get | 183 | * Don't allow the user to think they can get |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..294b8a271a04 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2558 | if (unlikely(test_time_stamp(delta))) { | 2558 | if (unlikely(test_time_stamp(delta))) { |
2559 | int local_clock_stable = 1; | 2559 | int local_clock_stable = 1; |
2560 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 2560 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
2561 | local_clock_stable = sched_clock_stable; | 2561 | local_clock_stable = sched_clock_stable(); |
2562 | #endif | 2562 | #endif |
2563 | WARN_ONCE(delta > (1ULL << 59), | 2563 | WARN_ONCE(delta > (1ULL << 59), |
2564 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", | 2564 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..6e32635e5e57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | #include <linux/ftrace.h> | 17 | #include <linux/ftrace.h> |
18 | #include <linux/sched/rt.h> | 18 | #include <linux/sched/rt.h> |
19 | #include <linux/sched/deadline.h> | ||
19 | #include <trace/events/sched.h> | 20 | #include <trace/events/sched.h> |
20 | #include "trace.h" | 21 | #include "trace.h" |
21 | 22 | ||
@@ -27,6 +28,8 @@ static int wakeup_cpu; | |||
27 | static int wakeup_current_cpu; | 28 | static int wakeup_current_cpu; |
28 | static unsigned wakeup_prio = -1; | 29 | static unsigned wakeup_prio = -1; |
29 | static int wakeup_rt; | 30 | static int wakeup_rt; |
31 | static int wakeup_dl; | ||
32 | static int tracing_dl = 0; | ||
30 | 33 | ||
31 | static arch_spinlock_t wakeup_lock = | 34 | static arch_spinlock_t wakeup_lock = |
32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 35 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
@@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr) | |||
437 | { | 440 | { |
438 | wakeup_cpu = -1; | 441 | wakeup_cpu = -1; |
439 | wakeup_prio = -1; | 442 | wakeup_prio = -1; |
443 | tracing_dl = 0; | ||
440 | 444 | ||
441 | if (wakeup_task) | 445 | if (wakeup_task) |
442 | put_task_struct(wakeup_task); | 446 | put_task_struct(wakeup_task); |
@@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
472 | tracing_record_cmdline(p); | 476 | tracing_record_cmdline(p); |
473 | tracing_record_cmdline(current); | 477 | tracing_record_cmdline(current); |
474 | 478 | ||
475 | if ((wakeup_rt && !rt_task(p)) || | 479 | /* |
476 | p->prio >= wakeup_prio || | 480 | * Semantics are like this: |
477 | p->prio >= current->prio) | 481 | * - wakeup tracer handles all tasks in the system, independently |
482 | * from their scheduling class; | ||
483 | * - wakeup_rt tracer handles tasks belonging to sched_dl and | ||
484 | * sched_rt class; | ||
485 | * - wakeup_dl handles tasks belonging to sched_dl class only. | ||
486 | */ | ||
487 | if (tracing_dl || (wakeup_dl && !dl_task(p)) || | ||
488 | (wakeup_rt && !dl_task(p) && !rt_task(p)) || | ||
489 | (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) | ||
478 | return; | 490 | return; |
479 | 491 | ||
480 | pc = preempt_count(); | 492 | pc = preempt_count(); |
@@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
486 | arch_spin_lock(&wakeup_lock); | 498 | arch_spin_lock(&wakeup_lock); |
487 | 499 | ||
488 | /* check for races. */ | 500 | /* check for races. */ |
489 | if (!tracer_enabled || p->prio >= wakeup_prio) | 501 | if (!tracer_enabled || tracing_dl || |
502 | (!dl_task(p) && p->prio >= wakeup_prio)) | ||
490 | goto out_locked; | 503 | goto out_locked; |
491 | 504 | ||
492 | /* reset the trace */ | 505 | /* reset the trace */ |
@@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
496 | wakeup_current_cpu = wakeup_cpu; | 509 | wakeup_current_cpu = wakeup_cpu; |
497 | wakeup_prio = p->prio; | 510 | wakeup_prio = p->prio; |
498 | 511 | ||
512 | /* | ||
513 | * Once you start tracing a -deadline task, don't bother tracing | ||
514 | * another task until the first one wakes up. | ||
515 | */ | ||
516 | if (dl_task(p)) | ||
517 | tracing_dl = 1; | ||
518 | else | ||
519 | tracing_dl = 0; | ||
520 | |||
499 | wakeup_task = p; | 521 | wakeup_task = p; |
500 | get_task_struct(wakeup_task); | 522 | get_task_struct(wakeup_task); |
501 | 523 | ||
@@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) | |||
597 | 619 | ||
598 | static int wakeup_tracer_init(struct trace_array *tr) | 620 | static int wakeup_tracer_init(struct trace_array *tr) |
599 | { | 621 | { |
622 | wakeup_dl = 0; | ||
600 | wakeup_rt = 0; | 623 | wakeup_rt = 0; |
601 | return __wakeup_tracer_init(tr); | 624 | return __wakeup_tracer_init(tr); |
602 | } | 625 | } |
603 | 626 | ||
604 | static int wakeup_rt_tracer_init(struct trace_array *tr) | 627 | static int wakeup_rt_tracer_init(struct trace_array *tr) |
605 | { | 628 | { |
629 | wakeup_dl = 0; | ||
606 | wakeup_rt = 1; | 630 | wakeup_rt = 1; |
607 | return __wakeup_tracer_init(tr); | 631 | return __wakeup_tracer_init(tr); |
608 | } | 632 | } |
609 | 633 | ||
634 | static int wakeup_dl_tracer_init(struct trace_array *tr) | ||
635 | { | ||
636 | wakeup_dl = 1; | ||
637 | wakeup_rt = 0; | ||
638 | return __wakeup_tracer_init(tr); | ||
639 | } | ||
640 | |||
610 | static void wakeup_tracer_reset(struct trace_array *tr) | 641 | static void wakeup_tracer_reset(struct trace_array *tr) |
611 | { | 642 | { |
612 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; | 643 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; |
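Each of these init functions runs when the corresponding tracer name is written to current_tracer. A hedged userspace sketch of selecting the new wakeup_dl flavour (the mount point is an assumption; in this era the tracing files usually live under debugfs):

/* Hedged sketch: pick the wakeup_dl tracer from userspace. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/current_tracer", "w");

	if (!f)
		return 1;
	fputs("wakeup_dl\n", f);	/* kernel then calls wakeup_dl_tracer_init() */
	fclose(f);
	return 0;
}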
@@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
674 | .use_max_tr = true, | 705 | .use_max_tr = true, |
675 | }; | 706 | }; |
676 | 707 | ||
708 | static struct tracer wakeup_dl_tracer __read_mostly = | ||
709 | { | ||
710 | .name = "wakeup_dl", | ||
711 | .init = wakeup_dl_tracer_init, | ||
712 | .reset = wakeup_tracer_reset, | ||
713 | .start = wakeup_tracer_start, | ||
714 | .stop = wakeup_tracer_stop, | ||
715 | .wait_pipe = poll_wait_pipe, | ||
716 | .print_max = true, | ||
717 | .print_header = wakeup_print_header, | ||
718 | .print_line = wakeup_print_line, | ||
719 | .flags = &tracer_flags, | ||
720 | .set_flag = wakeup_set_flag, | ||
721 | .flag_changed = wakeup_flag_changed, | ||
722 | #ifdef CONFIG_FTRACE_SELFTEST | ||
723 | .selftest = trace_selftest_startup_wakeup, | ||
724 | #endif | ||
725 | .open = wakeup_trace_open, | ||
726 | .close = wakeup_trace_close, | ||
727 | .use_max_tr = true, | ||
728 | }; | ||
729 | |||
677 | __init static int init_wakeup_tracer(void) | 730 | __init static int init_wakeup_tracer(void) |
678 | { | 731 | { |
679 | int ret; | 732 | int ret; |
@@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void) | |||
686 | if (ret) | 739 | if (ret) |
687 | return ret; | 740 | return ret; |
688 | 741 | ||
742 | ret = register_tracer(&wakeup_dl_tracer); | ||
743 | if (ret) | ||
744 | return ret; | ||
745 | |||
689 | return 0; | 746 | return 0; |
690 | } | 747 | } |
691 | core_initcall(init_wakeup_tracer); | 748 | core_initcall(init_wakeup_tracer); |
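Once registered, the wakeup_dl tracer reports its worst-case result through the same tracing_max_latency value the selftest below saves and restores, exposed as a file next to current_tracer. A hedged sketch of reading it back (the path is again an assumption; the value is in microseconds):

/* Hedged sketch: read the recorded worst-case wakeup latency back. */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/kernel/debug/tracing/tracing_max_latency", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("max wakeup latency: %s", buf);	/* microseconds */
	fclose(f);
	return 0;
}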
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
1022 | #ifdef CONFIG_SCHED_TRACER | 1022 | #ifdef CONFIG_SCHED_TRACER |
1023 | static int trace_wakeup_test_thread(void *data) | 1023 | static int trace_wakeup_test_thread(void *data) |
1024 | { | 1024 | { |
1025 | /* Make this a RT thread, doesn't need to be too high */ | 1025 | /* Make this a -deadline thread */ |
1026 | static const struct sched_param param = { .sched_priority = 5 }; | 1026 | static const struct sched_attr attr = { |
1027 | .sched_policy = SCHED_DEADLINE, | ||
1028 | .sched_runtime = 100000ULL, | ||
1029 | .sched_deadline = 10000000ULL, | ||
1030 | .sched_period = 10000000ULL | ||
1031 | }; | ||
1027 | struct completion *x = data; | 1032 | struct completion *x = data; |
1028 | 1033 | ||
1029 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 1034 | sched_setattr(current, &attr); |
1030 | 1035 | ||
1031 | /* Make it know we have a new prio */ | 1036 | /* Make it know we have a new prio */ |
1032 | complete(x); | 1037 | complete(x); |
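The selftest now gives its test thread the same -deadline parameters a userspace task would request through the new sched_setattr() syscall. A hedged standalone sketch using the selftest's numbers, 100 us of runtime every 10 ms (struct sched_attr is declared locally because libc did not wrap the syscall at the time; __NR_sched_setattr assumes 3.14+ kernel headers, and the call typically needs root):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

/* Local declaration; layout follows the kernel's uapi definition. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 100000ULL,	/* 100 us */
		.sched_deadline	= 10000000ULL,	/* 10 ms */
		.sched_period	= 10000000ULL,	/* 10 ms */
	};

	if (syscall(__NR_sched_setattr, 0, &attr, 0)) {	/* 0 = this task */
		perror("sched_setattr");
		return 1;
	}
	printf("now running as SCHED_DEADLINE\n");
	return 0;
}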
@@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) | |||
1040 | /* we are awake, now wait to disappear */ | 1045 | /* we are awake, now wait to disappear */ |
1041 | while (!kthread_should_stop()) { | 1046 | while (!kthread_should_stop()) { |
1042 | /* | 1047 | /* |
1043 | * This is an RT task, do short sleeps to let | 1048 | * This will likely be the system top priority |
1044 | * others run. | 1049 | * task, do short sleeps to let others run. |
1045 | */ | 1050 | */ |
1046 | msleep(100); | 1051 | msleep(100); |
1047 | } | 1052 | } |
@@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1054 | { | 1059 | { |
1055 | unsigned long save_max = tracing_max_latency; | 1060 | unsigned long save_max = tracing_max_latency; |
1056 | struct task_struct *p; | 1061 | struct task_struct *p; |
1057 | struct completion isrt; | 1062 | struct completion is_ready; |
1058 | unsigned long count; | 1063 | unsigned long count; |
1059 | int ret; | 1064 | int ret; |
1060 | 1065 | ||
1061 | init_completion(&isrt); | 1066 | init_completion(&is_ready); |
1062 | 1067 | ||
1063 | /* create a high prio thread */ | 1068 | /* create a -deadline thread */ |
1064 | p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); | 1069 | p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); |
1065 | if (IS_ERR(p)) { | 1070 | if (IS_ERR(p)) { |
1066 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); | 1071 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); |
1067 | return -1; | 1072 | return -1; |
1068 | } | 1073 | } |
1069 | 1074 | ||
1070 | /* make sure the thread is running at an RT prio */ | 1075 | /* make sure the thread is running at -deadline policy */ |
1071 | wait_for_completion(&isrt); | 1076 | wait_for_completion(&is_ready); |
1072 | 1077 | ||
1073 | /* start the tracing */ | 1078 | /* start the tracing */ |
1074 | ret = tracer_init(trace, tr); | 1079 | ret = tracer_init(trace, tr); |
@@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1082 | 1087 | ||
1083 | while (p->on_rq) { | 1088 | while (p->on_rq) { |
1084 | /* | 1089 | /* |
1085 | * Sleep to make sure the RT thread is asleep too. | 1090 | * Sleep to make sure the -deadline thread is asleep too. |
1086 | * On virtual machines we can't rely on timings, | 1091 | * On virtual machines we can't rely on timings, |
1087 | * but we want to make sure this test still works. | 1092 | * but we want to make sure this test still works. |
1088 | */ | 1093 | */ |
1089 | msleep(100); | 1094 | msleep(100); |
1090 | } | 1095 | } |
1091 | 1096 | ||
1092 | init_completion(&isrt); | 1097 | init_completion(&is_ready); |
1093 | 1098 | ||
1094 | wake_up_process(p); | 1099 | wake_up_process(p); |
1095 | 1100 | ||
1096 | /* Wait for the task to wake up */ | 1101 | /* Wait for the task to wake up */ |
1097 | wait_for_completion(&isrt); | 1102 | wait_for_completion(&is_ready); |
1098 | 1103 | ||
1099 | /* stop the tracing. */ | 1104 | /* stop the tracing. */ |
1100 | tracing_stop(); | 1105 | tracing_stop(); |
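The isrt to is_ready rename reflects that the completion is a generic "thread is set up" handshake rather than anything RT-specific. A hedged, minimal kernel-module sketch of that kthread_run()/completion pattern (names and module boilerplate are illustrative, not part of the selftest):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct completion is_ready;
static struct task_struct *worker;

static int worker_fn(void *data)
{
	complete(&is_ready);		/* tell the creator we are set up */
	while (!kthread_should_stop())
		msleep(100);		/* short sleeps, as in the selftest */
	return 0;
}

static int __init demo_init(void)
{
	init_completion(&is_ready);
	worker = kthread_run(worker_fn, NULL, "completion-demo");
	if (IS_ERR(worker))
		return PTR_ERR(worker);
	wait_for_completion(&is_ready);	/* don't proceed until it has run */
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");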
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4638e6f0238..82de78603686 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -1623,11 +1623,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1623 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && | 1623 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && |
1624 | !sysctl_tcp_low_latency && | 1624 | !sysctl_tcp_low_latency && |
1625 | net_dma_find_channel()) { | 1625 | net_dma_find_channel()) { |
1626 | preempt_enable_no_resched(); | 1626 | preempt_enable(); |
1627 | tp->ucopy.pinned_list = | 1627 | tp->ucopy.pinned_list = |
1628 | dma_pin_iovec_pages(msg->msg_iov, len); | 1628 | dma_pin_iovec_pages(msg->msg_iov, len); |
1629 | } else { | 1629 | } else { |
1630 | preempt_enable_no_resched(); | 1630 | preempt_enable(); |
1631 | } | 1631 | } |
1632 | } | 1632 | } |
1633 | #endif | 1633 | #endif |
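The tcp_recvmsg() change belongs to the preempt_enable_no_resched() cleanup: the _no_resched variant drops the preempt count without re-checking need_resched, so it is only legitimate immediately before an explicit schedule(). A hedged standalone sketch with stub counters (not the real <linux/preempt.h> macros) of the behavioural difference:

#include <stdio.h>

static int preempt_count;	/* stub for the per-CPU preempt counter */
static int need_resched;	/* stub for TIF_NEED_RESCHED */

static void schedule_stub(void)
{
	puts("reschedule happens now");
}

static void my_preempt_enable(void)
{
	if (--preempt_count == 0 && need_resched)
		schedule_stub();	/* pending preemption is honoured */
}

static void my_preempt_enable_no_resched(void)
{
	--preempt_count;		/* pending preemption stays latent */
}

int main(void)
{
	preempt_count = 1;
	need_resched = 1;
	my_preempt_enable_no_resched();	/* nothing happens: a latency hole */
	preempt_count = 1;
	my_preempt_enable();		/* prints: reschedule happens now */
	return 0;
}

In the tcp.c path above nothing calls schedule() right afterwards, so switching to preempt_enable() closes that latency hole.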