Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Three fixes for scheduler crashes, each triggers in relatively rare,
  hardware environment dependent situations"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Rework sched_fair time accounting
  math64: Add mul_u64_u32_shr()
  sched: Remove PREEMPT_NEED_RESCHED from generic code
  sched: Initialize power_orig for overlapping groups
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-12-17 15:35:54 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-12-17 15:35:54 -0500
commit: dd0508093b79141e0044ca02f0acb6319f69f546 (patch)
tree: 5e0116949fd98cfaee9b118b6637203d17fa5dd0
parent: 1070d5ac193af55fd335ef2aacaf03c5fc4ee461 (diff)
parent: 9dbdb155532395ba000c5d5d187658b0e17e529f (diff)
8 files changed, 126 insertions, 107 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e903c71f7e69..0952ecd60eca 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@ config X86
        select HAVE_AOUT if X86_32
        select HAVE_UNSTABLE_SCHED_CLOCK
        select ARCH_SUPPORTS_NUMA_BALANCING
+        select ARCH_SUPPORTS_INT128 if X86_64
        select ARCH_WANTS_PROT_NUMA_PROT_NONE
        select HAVE_IDE
        select HAVE_OPROFILE
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 8729723636fd..c8b051933b1b 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -8,6 +8,12 @@
 DECLARE_PER_CPU(int, __preempt_count);
 /*
+ * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
+ * that a decrement hitting 0 means we can and should reschedule.
+ */
+#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
+/*
 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
 * that think a non-zero value indicates we cannot preempt.
 */
@@ -74,6 +80,11 @@ static __always_inline void __preempt_count_sub(int val)
        __this_cpu_add_4(__preempt_count, -val);
 }
+/*
+ * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
+ * a decrement which hits zero means we have no preempt_count and should
+ * reschedule.
+ */
 static __always_inline bool __preempt_count_dec_and_test(void)
 {
        GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index ddf2b420ac8f..1cd3f5d767a8 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -3,13 +3,11 @@
 #include <linux/thread_info.h>
-/*
+#define PREEMPT_ENABLED (0)
- * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
- * that think a non-zero value indicates we cannot preempt.
- */
 static __always_inline int preempt_count(void)
 {
-        return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
+        return current_thread_info()->preempt_count;
 }
 static __always_inline int *preempt_count_ptr(void)
@@ -17,11 +15,6 @@ static __always_inline int *preempt_count_ptr(void)
        return &current_thread_info()->preempt_count;
 }
-/*
- * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
- * alternative is loosing a reschedule. Better schedule too often -- also this
- * should be a very rare operation.
- */
 static __always_inline void preempt_count_set(int pc)
 {
        *preempt_count_ptr() = pc;
@@ -41,28 +34,17 @@ static __always_inline void preempt_count_set(int pc)
        task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
 } while (0)
-/*
- * We fold the NEED_RESCHED bit into the preempt count such that
- * preempt_enable() can decrement and test for needing to reschedule with a
- * single instruction.
- *
- * We invert the actual bit, so that when the decrement hits 0 we know we both
- * need to resched (the bit is cleared) and can resched (no preempt count).
- */
 static __always_inline void set_preempt_need_resched(void)
 {
-        *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
 }
 static __always_inline void clear_preempt_need_resched(void)
 {
-        *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
 }
 static __always_inline bool test_preempt_need_resched(void)
 {
-        return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
+        return false;
 }
 /*
@@ -81,7 +63,12 @@ static __always_inline void __preempt_count_sub(int val)
 static __always_inline bool __preempt_count_dec_and_test(void)
 {
-        return !--*preempt_count_ptr();
+        /*
+         * Because of load-store architectures cannot do per-cpu atomic
+         * operations; we cannot use PREEMPT_NEED_RESCHED because it might get
+         * lost.
+         */
+        return !--*preempt_count_ptr() && tif_need_resched();
 }
 /*
@@ -89,7 +76,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
 */
 static __always_inline bool should_resched(void)
 {
-        return unlikely(!*preempt_count_ptr());
+        return unlikely(!preempt_count() && tif_need_resched());
 }
 #ifdef CONFIG_PREEMPT
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 69ed5f5e9f6e..c45c089bfdac 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,4 +133,34 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
        return ret;
 }
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+        return (u64)(((unsigned __int128)a * mul) >> shift);
+}
+#endif /* mul_u64_u32_shr */
+#else
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+        u32 ah, al;
+        u64 ret;
+        al = a;
+        ah = a >> 32;
+        ret = ((u64)al * mul) >> shift;
+        if (ah)
+                ret += ((u64)ah * mul) << (32 - shift);
+        return ret;
+}
+#endif /* mul_u64_u32_shr */
+#endif
 #endif /* _LINUX_MATH64_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 768b037dfacb..53f97eb8dbc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -440,8 +440,6 @@ struct task_cputime {
                .sum_exec_runtime = 0,                          \
        }
-#define PREEMPT_ENABLED         (PREEMPT_NEED_RESCHED)
 #ifdef CONFIG_PREEMPT_COUNT
 #define PREEMPT_DISABLED        (1 + PREEMPT_ENABLED)
 #else
@@ -932,7 +930,8 @@ struct pipe_inode_info;
 struct uts_namespace;
 struct load_weight {
-        unsigned long weight, inv_weight;
+        unsigned long weight;
+        u32 inv_weight;
 };
 struct sched_avg {
diff --git a/init/Kconfig b/init/Kconfig
index 79383d3aa5dc..4e5d96ab2034 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -809,6 +809,12 @@ config GENERIC_SCHED_CLOCK
 config ARCH_SUPPORTS_NUMA_BALANCING
        bool
+#
+# For architectures that know their GCC __int128 support is sound
+#
+config ARCH_SUPPORTS_INT128
+        bool
 # For architectures that (ab)use NUMA to represent different memory regions
 # all cpu-local but of different latencies, such as SuperH.
 #
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e85cda20ab2b..19af58f3a261 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5112,6 +5112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 * die on a /0 trap.
                 */
                sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+                sg->sgp->power_orig = sg->sgp->power;
                /*
                 * Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ade1a31..9030da7bcb15 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
        update_sysctl();
 }
-#if BITS_PER_LONG == 32
+#define WMULT_CONST     (~0U)
-# define WMULT_CONST    (~0UL)
-#else
-# define WMULT_CONST    (1UL << 32)
-#endif
 #define WMULT_SHIFT     32
-/*
+static void __update_inv_weight(struct load_weight *lw)
- * Shift right and round:
+{
- */
+        unsigned long w;
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+        if (likely(lw->inv_weight))
+                return;
+        w = scale_load_down(lw->weight);
+        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+                lw->inv_weight = 1;
+        else if (unlikely(!w))
+                lw->inv_weight = WMULT_CONST;
+        else
+                lw->inv_weight = WMULT_CONST / w;
+}
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
-static unsigned long
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-                struct load_weight *lw)
 {
-        u64 tmp;
+        u64 fact = scale_load_down(weight);
+        int shift = WMULT_SHIFT;
-        /*
+        __update_inv_weight(lw);
-         * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-         * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-         * 2^SCHED_LOAD_RESOLUTION.
-         */
-        if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-                tmp = (u64)delta_exec * scale_load_down(weight);
-        else
-                tmp = (u64)delta_exec;
-        if (!lw->inv_weight) {
+        if (unlikely(fact >> 32)) {
-                unsigned long w = scale_load_down(lw->weight);
+                while (fact >> 32) {
+                        fact >>= 1;
-                if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+                        shift--;
-                        lw->inv_weight = 1;
+                }
-                else if (unlikely(!w))
-                        lw->inv_weight = WMULT_CONST;
-                else
-                        lw->inv_weight = WMULT_CONST / w;
        }
-        /*
+        /* hint to use a 32x32->64 mul */
-         * Check whether we'd overflow the 64-bit multiplication:
+        fact = (u64)(u32)fact * lw->inv_weight;
-         */
-        if (unlikely(tmp > WMULT_CONST))
+        while (fact >> 32) {
-                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+                fact >>= 1;
-                        WMULT_SHIFT/2);
+                shift--;
-        else
+        }
-                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
-        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+        return mul_u64_u32_shr(delta_exec, fact, shift);
 }
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 /**************************************************************
 * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
 * delta /= w
 */
-static inline unsigned long
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
        if (unlikely(se->load.weight != NICE_0_LOAD))
-                delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
        return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        update_load_add(&lw, se->load.weight);
                        load = &lw;
                }
-                slice = calc_delta_mine(slice, se->load.weight, load);
+                slice = __calc_delta(slice, se->load.weight, load);
        }
        return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 /*
- * Update the current task's runtime statistics. Skip current tasks that
+ * Update the current task's runtime statistics.
- * are not in our scheduling class.
 */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-              unsigned long delta_exec)
-{
-        unsigned long delta_exec_weighted;
-        schedstat_set(curr->statistics.exec_max,
-                      max((u64)delta_exec, curr->statistics.exec_max));
-        curr->sum_exec_runtime += delta_exec;
-        schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-        curr->vruntime += delta_exec_weighted;
-        update_min_vruntime(cfs_rq);
-}
 static void update_curr(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
        u64 now = rq_clock_task(rq_of(cfs_rq));
-        unsigned long delta_exec;
+        u64 delta_exec;
        if (unlikely(!curr))
                return;
-        /*
+        delta_exec = now - curr->exec_start;
-         * Get the amount of time the current task was running
+        if (unlikely((s64)delta_exec <= 0))
-         * since the last time we changed load (this cannot
-         * overflow on 32 bits):
-         */
-        delta_exec = (unsigned long)(now - curr->exec_start);
-        if (!delta_exec)
                return;
-        __update_curr(cfs_rq, curr, delta_exec);
        curr->exec_start = now;
+        schedstat_set(curr->statistics.exec_max,
+                      max(delta_exec, curr->statistics.exec_max));
+        curr->sum_exec_runtime += delta_exec;
+        schedstat_add(cfs_rq, exec_clock, delta_exec);
+        curr->vruntime += calc_delta_fair(delta_exec, curr);
+        update_min_vruntime(cfs_rq);
        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        }
 }
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
-                                     unsigned long delta_exec)
 {
        /* dock delta_exec before expiring quota (as it could span periods) */
        cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
        return rq_clock_task(rq_of(cfs_rq));
 }
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-                                     unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-12-17 15:35:54 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-12-17 15:35:54 -0500
commit	dd0508093b79141e0044ca02f0acb6319f69f546 (patch)
tree	5e0116949fd98cfaee9b118b6637203d17fa5dd0
parent	1070d5ac193af55fd335ef2aacaf03c5fc4ee461 (diff)
parent	9dbdb155532395ba000c5d5d187658b0e17e529f (diff)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e903c71f7e69..0952ecd60eca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@ config X86
26	select HAVE_AOUT if X86_32	26	select HAVE_AOUT if X86_32
27	select HAVE_UNSTABLE_SCHED_CLOCK	27	select HAVE_UNSTABLE_SCHED_CLOCK
28	select ARCH_SUPPORTS_NUMA_BALANCING	28	select ARCH_SUPPORTS_NUMA_BALANCING
		29	select ARCH_SUPPORTS_INT128 if X86_64
29	select ARCH_WANTS_PROT_NUMA_PROT_NONE	30	select ARCH_WANTS_PROT_NUMA_PROT_NONE
30	select HAVE_IDE	31	select HAVE_IDE
31	select HAVE_OPROFILE	32	select HAVE_OPROFILE


diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 8729723636fd..c8b051933b1b 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h
@@ -8,6 +8,12 @@
8	DECLARE_PER_CPU(int, __preempt_count);	8	DECLARE_PER_CPU(int, __preempt_count);
9		9
10	/*	10	/*
		11	* We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
		12	* that a decrement hitting 0 means we can and should reschedule.
		13	*/
		14	#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
		15
		16	/*
11	* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users	17	* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
12	* that think a non-zero value indicates we cannot preempt.	18	* that think a non-zero value indicates we cannot preempt.
13	*/	19	*/
@@ -74,6 +80,11 @@ static __always_inline void __preempt_count_sub(int val)
74	__this_cpu_add_4(__preempt_count, -val);	80	__this_cpu_add_4(__preempt_count, -val);
75	}	81	}
76		82
		83	/*
		84	* Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
		85	* a decrement which hits zero means we have no preempt_count and should
		86	* reschedule.
		87	*/
77	static __always_inline bool __preempt_count_dec_and_test(void)	88	static __always_inline bool __preempt_count_dec_and_test(void)
78	{	89	{
79	GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");	90	GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");


diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index ddf2b420ac8f..1cd3f5d767a8 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h
@@ -3,13 +3,11 @@
3		3
4	#include <linux/thread_info.h>	4	#include <linux/thread_info.h>
5		5
6	/*	6	#define PREEMPT_ENABLED (0)
7	* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users	7
8	* that think a non-zero value indicates we cannot preempt.
9	*/
10	static __always_inline int preempt_count(void)	8	static __always_inline int preempt_count(void)
11	{	9	{
12	return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;	10	return current_thread_info()->preempt_count;
13	}	11	}
14		12
15	static __always_inline int *preempt_count_ptr(void)	13	static __always_inline int *preempt_count_ptr(void)
@@ -17,11 +15,6 @@ static __always_inline int *preempt_count_ptr(void)
17	return &current_thread_info()->preempt_count;	15	return &current_thread_info()->preempt_count;
18	}	16	}
19		17
20	/*
21	* We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
22	* alternative is loosing a reschedule. Better schedule too often -- also this
23	* should be a very rare operation.
24	*/
25	static __always_inline void preempt_count_set(int pc)	18	static __always_inline void preempt_count_set(int pc)
26	{	19	{
27	*preempt_count_ptr() = pc;	20	*preempt_count_ptr() = pc;
@@ -41,28 +34,17 @@ static __always_inline void preempt_count_set(int pc)
41	task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \	34	task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
42	} while (0)	35	} while (0)
43		36
44	/*
45	* We fold the NEED_RESCHED bit into the preempt count such that
46	* preempt_enable() can decrement and test for needing to reschedule with a
47	* single instruction.
48	*
49	* We invert the actual bit, so that when the decrement hits 0 we know we both
50	* need to resched (the bit is cleared) and can resched (no preempt count).
51	*/
52
53	static __always_inline void set_preempt_need_resched(void)	37	static __always_inline void set_preempt_need_resched(void)
54	{	38	{
55	*preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
56	}	39	}
57		40
58	static __always_inline void clear_preempt_need_resched(void)	41	static __always_inline void clear_preempt_need_resched(void)
59	{	42	{
60	*preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
61	}	43	}
62		44
63	static __always_inline bool test_preempt_need_resched(void)	45	static __always_inline bool test_preempt_need_resched(void)
64	{	46	{
65	return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);	47	return false;
66	}	48	}
67		49
68	/*	50	/*
@@ -81,7 +63,12 @@ static __always_inline void __preempt_count_sub(int val)
81		63
82	static __always_inline bool __preempt_count_dec_and_test(void)	64	static __always_inline bool __preempt_count_dec_and_test(void)
83	{	65	{
84	return !--*preempt_count_ptr();	66	/*
		67	* Because of load-store architectures cannot do per-cpu atomic
		68	* operations; we cannot use PREEMPT_NEED_RESCHED because it might get
		69	* lost.
		70	*/
		71	return !--*preempt_count_ptr() && tif_need_resched();
85	}	72	}
86		73
87	/*	74	/*
@@ -89,7 +76,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
89	*/	76	*/
90	static __always_inline bool should_resched(void)	77	static __always_inline bool should_resched(void)
91	{	78	{
92	return unlikely(!*preempt_count_ptr());	79	return unlikely(!preempt_count() && tif_need_resched());
93	}	80	}
94		81
95	#ifdef CONFIG_PREEMPT	82	#ifdef CONFIG_PREEMPT


diff --git a/include/linux/math64.h b/include/linux/math64.h index 69ed5f5e9f6e..c45c089bfdac 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h
@@ -133,4 +133,34 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
133	return ret;	133	return ret;
134	}	134	}
135		135
		136	#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
		137
		138	#ifndef mul_u64_u32_shr
		139	static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
		140	{
		141	return (u64)(((unsigned __int128)a * mul) >> shift);
		142	}
		143	#endif /* mul_u64_u32_shr */
		144
		145	#else
		146
		147	#ifndef mul_u64_u32_shr
		148	static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
		149	{
		150	u32 ah, al;
		151	u64 ret;
		152
		153	al = a;
		154	ah = a >> 32;
		155
		156	ret = ((u64)al * mul) >> shift;
		157	if (ah)
		158	ret += ((u64)ah * mul) << (32 - shift);
		159
		160	return ret;
		161	}
		162	#endif /* mul_u64_u32_shr */
		163
		164	#endif
		165
136	#endif /* _LINUX_MATH64_H */	166	#endif /* _LINUX_MATH64_H */


diff --git a/include/linux/sched.h b/include/linux/sched.h index 768b037dfacb..53f97eb8dbc7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h
@@ -440,8 +440,6 @@ struct task_cputime {
440	.sum_exec_runtime = 0, \	440	.sum_exec_runtime = 0, \
441	}	441	}
442		442
443	#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED)
444
445	#ifdef CONFIG_PREEMPT_COUNT	443	#ifdef CONFIG_PREEMPT_COUNT
446	#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)	444	#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
447	#else	445	#else
@@ -932,7 +930,8 @@ struct pipe_inode_info;
932	struct uts_namespace;	930	struct uts_namespace;
933		931
934	struct load_weight {	932	struct load_weight {
935	unsigned long weight, inv_weight;	933	unsigned long weight;
		934	u32 inv_weight;
936	};	935	};
937		936
938	struct sched_avg {	937	struct sched_avg {


diff --git a/init/Kconfig b/init/Kconfig index 79383d3aa5dc..4e5d96ab2034 100644 --- a/init/Kconfig +++ b/init/Kconfig
@@ -809,6 +809,12 @@ config GENERIC_SCHED_CLOCK
809	config ARCH_SUPPORTS_NUMA_BALANCING	809	config ARCH_SUPPORTS_NUMA_BALANCING
810	bool	810	bool
811		811
		812	#
		813	# For architectures that know their GCC __int128 support is sound
		814	#
		815	config ARCH_SUPPORTS_INT128
		816	bool
		817
812	# For architectures that (ab)use NUMA to represent different memory regions	818	# For architectures that (ab)use NUMA to represent different memory regions
813	# all cpu-local but of different latencies, such as SuperH.	819	# all cpu-local but of different latencies, such as SuperH.
814	#	820	#


diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e85cda20ab2b..19af58f3a261 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c
@@ -5112,6 +5112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5112	* die on a /0 trap.	5112	* die on a /0 trap.
5113	*/	5113	*/
5114	sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);	5114	sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
		5115	sg->sgp->power_orig = sg->sgp->power;
5115		5116
5116	/*	5117	/*
5117	* Make sure the first group of this domain contains the	5118	* Make sure the first group of this domain contains the


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd773ade1a31..9030da7bcb15 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
178	update_sysctl();	178	update_sysctl();
179	}	179	}
180		180
181	#if BITS_PER_LONG == 32	181	#define WMULT_CONST (~0U)
182	# define WMULT_CONST (~0UL)
183	#else
184	# define WMULT_CONST (1UL << 32)
185	#endif
186
187	#define WMULT_SHIFT 32	182	#define WMULT_SHIFT 32
188		183
189	/*	184	static void __update_inv_weight(struct load_weight *lw)
190	* Shift right and round:	185	{
191	*/	186	unsigned long w;
192	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))	187
		188	if (likely(lw->inv_weight))
		189	return;
		190
		191	w = scale_load_down(lw->weight);
		192
		193	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		194	lw->inv_weight = 1;
		195	else if (unlikely(!w))
		196	lw->inv_weight = WMULT_CONST;
		197	else
		198	lw->inv_weight = WMULT_CONST / w;
		199	}
193		200
194	/*	201	/*
195	* delta *= weight / lw	202	* delta_exec * weight / lw.weight
		203	* OR
		204	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
		205	*
		206	* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
		207	* we're guaranteed shift stays positive because inv_weight is guaranteed to
		208	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
		209	*
		210	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
		211	* weight/lw.weight <= 1, and therefore our shift will also be positive.
196	*/	212	*/
197	static unsigned long	213	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
198	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199	struct load_weight *lw)
200	{	214	{
201	u64 tmp;	215	u64 fact = scale_load_down(weight);
		216	int shift = WMULT_SHIFT;
202		217
203	/*	218	__update_inv_weight(lw);
204	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
206	* 2^SCHED_LOAD_RESOLUTION.
207	*/
208	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209	tmp = (u64)delta_exec * scale_load_down(weight);
210	else
211	tmp = (u64)delta_exec;
212		219
213	if (!lw->inv_weight) {	220	if (unlikely(fact >> 32)) {
214	unsigned long w = scale_load_down(lw->weight);	221	while (fact >> 32) {
215		222	fact >>= 1;
216	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))	223	shift--;
217	lw->inv_weight = 1;	224	}
218	else if (unlikely(!w))
219	lw->inv_weight = WMULT_CONST;
220	else
221	lw->inv_weight = WMULT_CONST / w;
222	}	225	}
223		226
224	/*	227	/* hint to use a 32x32->64 mul */
225	* Check whether we'd overflow the 64-bit multiplication:	228	fact = (u64)(u32)fact * lw->inv_weight;
226	*/	229
227	if (unlikely(tmp > WMULT_CONST))	230	while (fact >> 32) {
228	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,	231	fact >>= 1;
229	WMULT_SHIFT/2);	232	shift--;
230	else	233	}
231	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232		234
233	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);	235	return mul_u64_u32_shr(delta_exec, fact, shift);
234	}	236	}
235		237
236		238
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
443	#endif /* CONFIG_FAIR_GROUP_SCHED */	445	#endif /* CONFIG_FAIR_GROUP_SCHED */
444		446
445	static __always_inline	447	static __always_inline
446	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);	448	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
447		449
448	/**************************************************************	450	/**************************************************************
449	* Scheduling class tree data structure manipulation methods:	451	* Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
612	/*	614	/*
613	* delta /= w	615	* delta /= w
614	*/	616	*/
615	static inline unsigned long	617	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616	calc_delta_fair(unsigned long delta, struct sched_entity *se)
617	{	618	{
618	if (unlikely(se->load.weight != NICE_0_LOAD))	619	if (unlikely(se->load.weight != NICE_0_LOAD))
619	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);	620	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
620		621
621	return delta;	622	return delta;
622	}	623	}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665	update_load_add(&lw, se->load.weight);	666	update_load_add(&lw, se->load.weight);
666	load = &lw;	667	load = &lw;
667	}	668	}
668	slice = calc_delta_mine(slice, se->load.weight, load);	669	slice = __calc_delta(slice, se->load.weight, load);
669	}	670	}
670	return slice;	671	return slice;
671	}	672	}
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
703	#endif	704	#endif
704		705
705	/*	706	/*
706	* Update the current task's runtime statistics. Skip current tasks that	707	* Update the current task's runtime statistics.
707	* are not in our scheduling class.
708	*/	708	*/
709	static inline void
710	__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
711	unsigned long delta_exec)
712	{
713	unsigned long delta_exec_weighted;
714
715	schedstat_set(curr->statistics.exec_max,
716	max((u64)delta_exec, curr->statistics.exec_max));
717
718	curr->sum_exec_runtime += delta_exec;
719	schedstat_add(cfs_rq, exec_clock, delta_exec);
720	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
721
722	curr->vruntime += delta_exec_weighted;
723	update_min_vruntime(cfs_rq);
724	}
725
726	static void update_curr(struct cfs_rq *cfs_rq)	709	static void update_curr(struct cfs_rq *cfs_rq)
727	{	710	{
728	struct sched_entity *curr = cfs_rq->curr;	711	struct sched_entity *curr = cfs_rq->curr;
729	u64 now = rq_clock_task(rq_of(cfs_rq));	712	u64 now = rq_clock_task(rq_of(cfs_rq));
730	unsigned long delta_exec;	713	u64 delta_exec;
731		714
732	if (unlikely(!curr))	715	if (unlikely(!curr))
733	return;	716	return;
734		717
735	/*	718	delta_exec = now - curr->exec_start;
736	* Get the amount of time the current task was running	719	if (unlikely((s64)delta_exec <= 0))
737	* since the last time we changed load (this cannot
738	* overflow on 32 bits):
739	*/
740	delta_exec = (unsigned long)(now - curr->exec_start);
741	if (!delta_exec)
742	return;	720	return;
743		721
744	__update_curr(cfs_rq, curr, delta_exec);
745	curr->exec_start = now;	722	curr->exec_start = now;
746		723
		724	schedstat_set(curr->statistics.exec_max,
		725	max(delta_exec, curr->statistics.exec_max));
		726
		727	curr->sum_exec_runtime += delta_exec;
		728	schedstat_add(cfs_rq, exec_clock, delta_exec);
		729
		730	curr->vruntime += calc_delta_fair(delta_exec, curr);
		731	update_min_vruntime(cfs_rq);
		732
747	if (entity_is_task(curr)) {	733	if (entity_is_task(curr)) {
748	struct task_struct *curtask = task_of(curr);	734	struct task_struct *curtask = task_of(curr);
749		735
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3015	}	3001	}
3016	}	3002	}
3017		3003
3018	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,	3004	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3019	unsigned long delta_exec)
3020	{	3005	{
3021	/* dock delta_exec before expiring quota (as it could span periods) */	3006	/* dock delta_exec before expiring quota (as it could span periods) */
3022	cfs_rq->runtime_remaining -= delta_exec;	3007	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3034	}	3019	}
3035		3020
3036	static __always_inline	3021	static __always_inline
3037	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)	3022	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3038	{	3023	{
3039	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)	3024	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3040	return;	3025	return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3574	return rq_clock_task(rq_of(cfs_rq));	3559	return rq_clock_task(rq_of(cfs_rq));
3575	}	3560	}
3576		3561
3577	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,	3562	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3578	unsigned long delta_exec) {}
3579	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}	3563	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3580	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}	3564	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3581	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}	3565	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}