about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2013-12-17 15:35:54 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-12-17 15:35:54 -0500
commit dd0508093b79141e0044ca02f0acb6319f69f546 (patch)
tree 5e0116949fd98cfaee9b118b6637203d17fa5dd0
parent 1070d5ac193af55fd335ef2aacaf03c5fc4ee461 (diff)
parent 9dbdb155532395ba000c5d5d187658b0e17e529f (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Three fixes for scheduler crashes, each triggers in relatively rare,
  hardware environment dependent situations"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Rework sched_fair time accounting
  math64: Add mul_u64_u32_shr()
  sched: Remove PREEMPT_NEED_RESCHED from generic code
  sched: Initialize power_orig for overlapping groups
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/include/asm/preempt.h | 11
-rw-r--r-- include/asm-generic/preempt.h | 35
-rw-r--r-- include/linux/math64.h | 30
-rw-r--r-- include/linux/sched.h | 5
-rw-r--r-- init/Kconfig | 6
-rw-r--r-- kernel/sched/core.c | 1
-rw-r--r-- kernel/sched/fair.c | 144
8 files changed, 126 insertions, 107 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e903c71f7e69..0952ecd60eca 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@ config X86
26 select HAVE_AOUT if X86_32 26 select HAVE_AOUT if X86_32
27 select HAVE_UNSTABLE_SCHED_CLOCK 27 select HAVE_UNSTABLE_SCHED_CLOCK
28 select ARCH_SUPPORTS_NUMA_BALANCING 28 select ARCH_SUPPORTS_NUMA_BALANCING
29 select ARCH_SUPPORTS_INT128 if X86_64
29 select ARCH_WANTS_PROT_NUMA_PROT_NONE 30 select ARCH_WANTS_PROT_NUMA_PROT_NONE
30 select HAVE_IDE 31 select HAVE_IDE
31 select HAVE_OPROFILE 32 select HAVE_OPROFILE
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 8729723636fd..c8b051933b1b 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -8,6 +8,12 @@
8DECLARE_PER_CPU(int, __preempt_count); 8DECLARE_PER_CPU(int, __preempt_count);
9 9
10/* 10/*
11 * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
12 * that a decrement hitting 0 means we can and should reschedule.
13 */
14#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
15
16/*
11 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users 17 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
12 * that think a non-zero value indicates we cannot preempt. 18 * that think a non-zero value indicates we cannot preempt.
13 */ 19 */
@@ -74,6 +80,11 @@ static __always_inline void __preempt_count_sub(int val)
74 __this_cpu_add_4(__preempt_count, -val); 80 __this_cpu_add_4(__preempt_count, -val);
75} 81}
76 82
83/*
84 * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
85 * a decrement which hits zero means we have no preempt_count and should
86 * reschedule.
87 */
77static __always_inline bool __preempt_count_dec_and_test(void) 88static __always_inline bool __preempt_count_dec_and_test(void)
78{ 89{
79 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); 90 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index ddf2b420ac8f..1cd3f5d767a8 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -3,13 +3,11 @@
3 3
4#include <linux/thread_info.h> 4#include <linux/thread_info.h>
5 5
6/* 6#define PREEMPT_ENABLED (0)
7 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users 7
8 * that think a non-zero value indicates we cannot preempt.
9 */
10static __always_inline int preempt_count(void) 8static __always_inline int preempt_count(void)
11{ 9{
12 return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; 10 return current_thread_info()->preempt_count;
13} 11}
14 12
15static __always_inline int *preempt_count_ptr(void) 13static __always_inline int *preempt_count_ptr(void)
@@ -17,11 +15,6 @@ static __always_inline int *preempt_count_ptr(void)
17 return &current_thread_info()->preempt_count; 15 return &current_thread_info()->preempt_count;
18} 16}
19 17
20/*
21 * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
22 * alternative is losing a reschedule. Better schedule too often -- also this
23 * should be a very rare operation.
24 */
25static __always_inline void preempt_count_set(int pc) 18static __always_inline void preempt_count_set(int pc)
26{ 19{
27 *preempt_count_ptr() = pc; 20 *preempt_count_ptr() = pc;
@@ -41,28 +34,17 @@ static __always_inline void preempt_count_set(int pc)
41 task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ 34 task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
42} while (0) 35} while (0)
43 36
44/*
45 * We fold the NEED_RESCHED bit into the preempt count such that
46 * preempt_enable() can decrement and test for needing to reschedule with a
47 * single instruction.
48 *
49 * We invert the actual bit, so that when the decrement hits 0 we know we both
50 * need to resched (the bit is cleared) and can resched (no preempt count).
51 */
52
53static __always_inline void set_preempt_need_resched(void) 37static __always_inline void set_preempt_need_resched(void)
54{ 38{
55 *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
56} 39}
57 40
58static __always_inline void clear_preempt_need_resched(void) 41static __always_inline void clear_preempt_need_resched(void)
59{ 42{
60 *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
61} 43}
62 44
63static __always_inline bool test_preempt_need_resched(void) 45static __always_inline bool test_preempt_need_resched(void)
64{ 46{
65 return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); 47 return false;
66} 48}
67 49
68/* 50/*
@@ -81,7 +63,12 @@ static __always_inline void __preempt_count_sub(int val)
81 63
82static __always_inline bool __preempt_count_dec_and_test(void) 64static __always_inline bool __preempt_count_dec_and_test(void)
83{ 65{
84 return !--*preempt_count_ptr(); 66 /*
67 * Because load-store architectures cannot do per-cpu atomic
68 * operations, we cannot use PREEMPT_NEED_RESCHED because it might get
69 * lost.
70 */
71 return !--*preempt_count_ptr() && tif_need_resched();
85} 72}
86 73
87/* 74/*
@@ -89,7 +76,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
89 */ 76 */
90static __always_inline bool should_resched(void) 77static __always_inline bool should_resched(void)
91{ 78{
92 return unlikely(!*preempt_count_ptr()); 79 return unlikely(!preempt_count() && tif_need_resched());
93} 80}
94 81
95#ifdef CONFIG_PREEMPT 82#ifdef CONFIG_PREEMPT
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 69ed5f5e9f6e..c45c089bfdac 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,4 +133,34 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
133 return ret; 133 return ret;
134} 134}
135 135
136#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
137
138#ifndef mul_u64_u32_shr
139static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
140{
141 return (u64)(((unsigned __int128)a * mul) >> shift);
142}
143#endif /* mul_u64_u32_shr */
144
145#else
146
147#ifndef mul_u64_u32_shr
148static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
149{
150 u32 ah, al;
151 u64 ret;
152
153 al = a;
154 ah = a >> 32;
155
156 ret = ((u64)al * mul) >> shift;
157 if (ah)
158 ret += ((u64)ah * mul) << (32 - shift);
159
160 return ret;
161}
162#endif /* mul_u64_u32_shr */
163
164#endif
165
136#endif /* _LINUX_MATH64_H */ 166#endif /* _LINUX_MATH64_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 768b037dfacb..53f97eb8dbc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -440,8 +440,6 @@ struct task_cputime {
440 .sum_exec_runtime = 0, \ 440 .sum_exec_runtime = 0, \
441 } 441 }
442 442
443#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED)
444
445#ifdef CONFIG_PREEMPT_COUNT 443#ifdef CONFIG_PREEMPT_COUNT
446#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) 444#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
447#else 445#else
@@ -932,7 +930,8 @@ struct pipe_inode_info;
932struct uts_namespace; 930struct uts_namespace;
933 931
934struct load_weight { 932struct load_weight {
935 unsigned long weight, inv_weight; 933 unsigned long weight;
934 u32 inv_weight;
936}; 935};
937 936
938struct sched_avg { 937struct sched_avg {
diff --git a/init/Kconfig b/init/Kconfig
index 79383d3aa5dc..4e5d96ab2034 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -809,6 +809,12 @@ config GENERIC_SCHED_CLOCK
809config ARCH_SUPPORTS_NUMA_BALANCING 809config ARCH_SUPPORTS_NUMA_BALANCING
810 bool 810 bool
811 811
812#
813# For architectures that know their GCC __int128 support is sound
814#
815config ARCH_SUPPORTS_INT128
816 bool
817
812# For architectures that (ab)use NUMA to represent different memory regions 818# For architectures that (ab)use NUMA to represent different memory regions
813# all cpu-local but of different latencies, such as SuperH. 819# all cpu-local but of different latencies, such as SuperH.
814# 820#
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e85cda20ab2b..19af58f3a261 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5112,6 +5112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5112 * die on a /0 trap. 5112 * die on a /0 trap.
5113 */ 5113 */
5114 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5114 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5115 sg->sgp->power_orig = sg->sgp->power;
5115 5116
5116 /* 5117 /*
5117 * Make sure the first group of this domain contains the 5118 * Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ade1a31..9030da7bcb15 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
178 update_sysctl(); 178 update_sysctl();
179} 179}
180 180
181#if BITS_PER_LONG == 32 181#define WMULT_CONST (~0U)
182# define WMULT_CONST (~0UL)
183#else
184# define WMULT_CONST (1UL << 32)
185#endif
186
187#define WMULT_SHIFT 32 182#define WMULT_SHIFT 32
188 183
189/* 184static void __update_inv_weight(struct load_weight *lw)
190 * Shift right and round: 185{
191 */ 186 unsigned long w;
192#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 187
188 if (likely(lw->inv_weight))
189 return;
190
191 w = scale_load_down(lw->weight);
192
193 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 lw->inv_weight = 1;
195 else if (unlikely(!w))
196 lw->inv_weight = WMULT_CONST;
197 else
198 lw->inv_weight = WMULT_CONST / w;
199}
193 200
194/* 201/*
195 * delta *= weight / lw 202 * delta_exec * weight / lw.weight
203 * OR
204 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205 *
206 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207 * we're guaranteed shift stays positive because inv_weight is guaranteed to
208 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209 *
210 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
211 * weight/lw.weight <= 1, and therefore our shift will also be positive.
196 */ 212 */
197static unsigned long 213static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
198calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 struct load_weight *lw)
200{ 214{
201 u64 tmp; 215 u64 fact = scale_load_down(weight);
216 int shift = WMULT_SHIFT;
202 217
203 /* 218 __update_inv_weight(lw);
204 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 * 2^SCHED_LOAD_RESOLUTION.
207 */
208 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 tmp = (u64)delta_exec * scale_load_down(weight);
210 else
211 tmp = (u64)delta_exec;
212 219
213 if (!lw->inv_weight) { 220 if (unlikely(fact >> 32)) {
214 unsigned long w = scale_load_down(lw->weight); 221 while (fact >> 32) {
215 222 fact >>= 1;
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 223 shift--;
217 lw->inv_weight = 1; 224 }
218 else if (unlikely(!w))
219 lw->inv_weight = WMULT_CONST;
220 else
221 lw->inv_weight = WMULT_CONST / w;
222 } 225 }
223 226
224 /* 227 /* hint to use a 32x32->64 mul */
225 * Check whether we'd overflow the 64-bit multiplication: 228 fact = (u64)(u32)fact * lw->inv_weight;
226 */ 229
227 if (unlikely(tmp > WMULT_CONST)) 230 while (fact >> 32) {
228 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 231 fact >>= 1;
229 WMULT_SHIFT/2); 232 shift--;
230 else 233 }
231 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232 234
233 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 235 return mul_u64_u32_shr(delta_exec, fact, shift);
234} 236}
235 237
236 238
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
443#endif /* CONFIG_FAIR_GROUP_SCHED */ 445#endif /* CONFIG_FAIR_GROUP_SCHED */
444 446
445static __always_inline 447static __always_inline
446void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); 448void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
447 449
448/************************************************************** 450/**************************************************************
449 * Scheduling class tree data structure manipulation methods: 451 * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
612/* 614/*
613 * delta /= w 615 * delta /= w
614 */ 616 */
615static inline unsigned long 617static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616calc_delta_fair(unsigned long delta, struct sched_entity *se)
617{ 618{
618 if (unlikely(se->load.weight != NICE_0_LOAD)) 619 if (unlikely(se->load.weight != NICE_0_LOAD))
619 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); 620 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
620 621
621 return delta; 622 return delta;
622} 623}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665 update_load_add(&lw, se->load.weight); 666 update_load_add(&lw, se->load.weight);
666 load = &lw; 667 load = &lw;
667 } 668 }
668 slice = calc_delta_mine(slice, se->load.weight, load); 669 slice = __calc_delta(slice, se->load.weight, load);
669 } 670 }
670 return slice; 671 return slice;
671} 672}
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
703#endif 704#endif
704 705
705/* 706/*
706 * Update the current task's runtime statistics. Skip current tasks that 707 * Update the current task's runtime statistics.
707 * are not in our scheduling class.
708 */ 708 */
709static inline void
710__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
711 unsigned long delta_exec)
712{
713 unsigned long delta_exec_weighted;
714
715 schedstat_set(curr->statistics.exec_max,
716 max((u64)delta_exec, curr->statistics.exec_max));
717
718 curr->sum_exec_runtime += delta_exec;
719 schedstat_add(cfs_rq, exec_clock, delta_exec);
720 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
721
722 curr->vruntime += delta_exec_weighted;
723 update_min_vruntime(cfs_rq);
724}
725
726static void update_curr(struct cfs_rq *cfs_rq) 709static void update_curr(struct cfs_rq *cfs_rq)
727{ 710{
728 struct sched_entity *curr = cfs_rq->curr; 711 struct sched_entity *curr = cfs_rq->curr;
729 u64 now = rq_clock_task(rq_of(cfs_rq)); 712 u64 now = rq_clock_task(rq_of(cfs_rq));
730 unsigned long delta_exec; 713 u64 delta_exec;
731 714
732 if (unlikely(!curr)) 715 if (unlikely(!curr))
733 return; 716 return;
734 717
735 /* 718 delta_exec = now - curr->exec_start;
736 * Get the amount of time the current task was running 719 if (unlikely((s64)delta_exec <= 0))
737 * since the last time we changed load (this cannot
738 * overflow on 32 bits):
739 */
740 delta_exec = (unsigned long)(now - curr->exec_start);
741 if (!delta_exec)
742 return; 720 return;
743 721
744 __update_curr(cfs_rq, curr, delta_exec);
745 curr->exec_start = now; 722 curr->exec_start = now;
746 723
724 schedstat_set(curr->statistics.exec_max,
725 max(delta_exec, curr->statistics.exec_max));
726
727 curr->sum_exec_runtime += delta_exec;
728 schedstat_add(cfs_rq, exec_clock, delta_exec);
729
730 curr->vruntime += calc_delta_fair(delta_exec, curr);
731 update_min_vruntime(cfs_rq);
732
747 if (entity_is_task(curr)) { 733 if (entity_is_task(curr)) {
748 struct task_struct *curtask = task_of(curr); 734 struct task_struct *curtask = task_of(curr);
749 735
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3015 } 3001 }
3016} 3002}
3017 3003
3018static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3004static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3019 unsigned long delta_exec)
3020{ 3005{
3021 /* dock delta_exec before expiring quota (as it could span periods) */ 3006 /* dock delta_exec before expiring quota (as it could span periods) */
3022 cfs_rq->runtime_remaining -= delta_exec; 3007 cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3034} 3019}
3035 3020
3036static __always_inline 3021static __always_inline
3037void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) 3022void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3038{ 3023{
3039 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 3024 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3040 return; 3025 return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3574 return rq_clock_task(rq_of(cfs_rq)); 3559 return rq_clock_task(rq_of(cfs_rq));
3575} 3560}
3576 3561
3577static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3562static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3578 unsigned long delta_exec) {}
3579static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3563static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3580static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3564static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3581static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3565static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}