-rw-r--r--  arch/x86/include/asm/preempt.h |  1
-rw-r--r--  include/uapi/linux/sched.h     |  2
-rw-r--r--  kernel/context_tracking.c      | 40
-rw-r--r--  kernel/sched/core.c            | 47
-rw-r--r--  kernel/sched/deadline.c        | 41
-rw-r--r--  kernel/sched/fair.c            | 21
-rw-r--r--  kernel/sysctl.c                |  3
7 files changed, 99 insertions(+), 56 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 7024c12f7bfe..400873450e33 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -105,6 +105,7 @@ static __always_inline bool should_resched(void)
 # ifdef CONFIG_CONTEXT_TRACKING
   extern asmlinkage void ___preempt_schedule_context(void);
 # define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
+  extern asmlinkage void preempt_schedule_context(void);
 # endif
 #endif
 
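The new extern declaration gives C callers a prototype for preempt_schedule_context(), whose definition moves to kernel/sched/core.c later in this patch; the ___preempt_schedule_context asm thunk next to it is unchanged. For orientation, this is roughly how a preempt_enable in tracing code reaches the function declared here (a paraphrase of include/linux/preempt.h from this era, not part of the patch):

    /* Sketch: call path from tracing code into preempt_schedule_context(). */
    #define preempt_enable_notrace() \
    do { \
            barrier(); \
            if (unlikely(__preempt_count_dec_and_test())) \
                    __preempt_schedule_context(); \
    } while (0)
    /*
     * __preempt_schedule_context() emits "call ___preempt_schedule_context";
     * that thunk saves the clobbered registers and then calls the C function
     * preempt_schedule_context() declared above.
     */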
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d7387d13..b932be9f5c5b 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -13,7 +13,7 @@
 #define CLONE_VFORK             0x00004000      /* set if the parent wants the child to wake it up on mm_release */
 #define CLONE_PARENT            0x00008000      /* set if we want to have the same parent as the cloner */
 #define CLONE_THREAD            0x00010000      /* Same thread group? */
-#define CLONE_NEWNS             0x00020000      /* New namespace group? */
+#define CLONE_NEWNS             0x00020000      /* New mount namespace group */
 #define CLONE_SYSVSEM           0x00040000      /* share system V SEM_UNDO semantics */
 #define CLONE_SETTLS            0x00080000      /* create a new TLS for the child */
 #define CLONE_PARENT_SETTID     0x00100000      /* set the TID in the parent */
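The comment fix deserves a word of context: CLONE_NEWNS was named before other namespaces existed and has nothing to do with a generic "namespace group"; it puts the child (or, with unshare(2), the caller) in a new mount namespace. A minimal userspace illustration, not part of the patch (requires CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* Detach this process into a private mount namespace. */
            if (unshare(CLONE_NEWNS) != 0) {
                    perror("unshare(CLONE_NEWNS)");
                    return 1;
            }
            /* Mounts made from here on are invisible to the original namespace. */
            puts("now in a private mount namespace");
            return 0;
    }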
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
 }
 NOKPROBE_SYMBOL(context_tracking_user_enter);
 
-#ifdef CONFIG_PREEMPT
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
-        enum ctx_state prev_ctx;
-
-        if (likely(!preemptible()))
-                return;
-
-        /*
-         * Need to disable preemption in case user_exit() is traced
-         * and the tracer calls preempt_enable_notrace() causing
-         * an infinite recursion.
-         */
-        preempt_disable_notrace();
-        prev_ctx = exception_enter();
-        preempt_enable_no_resched_notrace();
-
-        preempt_schedule();
-
-        preempt_disable_notrace();
-        exception_exit(prev_ctx);
-        preempt_enable_notrace();
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_PREEMPT */
-
 /**
  * context_tracking_user_exit - Inform the context tracking that the CPU is
  *                              exiting userspace mode and entering the kernel.
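Nothing is lost here: the function is moved (and reworked) into kernel/sched/core.c in the next file. The recursion the deleted comment warns about looks roughly like this (illustrative only, not code from the tree):

    /*
     * tracer callback
     *   -> preempt_enable()          (itself traced)
     *      -> preempt_schedule() -> user_exit()
     *         -> tracepoint fires -> tracer callback -> ... (unbounded)
     *
     * The _notrace variants break the cycle by not emitting trace
     * events of their own, which is why this function must use them.
     */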
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..240157c13ddc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+        enum ctx_state prev_ctx;
+
+        if (likely(!preemptible()))
+                return;
+
+        do {
+                __preempt_count_add(PREEMPT_ACTIVE);
+                /*
+                 * Needs preempt disabled in case user_exit() is traced
+                 * and the tracer calls preempt_enable_notrace() causing
+                 * an infinite recursion.
+                 */
+                prev_ctx = exception_enter();
+                __schedule();
+                exception_exit(prev_ctx);
+
+                __preempt_count_sub(PREEMPT_ACTIVE);
+                barrier();
+        } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
 #endif /* CONFIG_PREEMPT */
 
 /*
@@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
         sched_offline_group(tg);
 }
 
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+        sched_move_task(task);
+}
+
 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
                                  struct cgroup_taskset *tset)
 {
@@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
         .css_free       = cpu_cgroup_css_free,
         .css_online     = cpu_cgroup_css_online,
         .css_offline    = cpu_cgroup_css_offline,
+        .fork           = cpu_cgroup_fork,
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
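The relocated function now open-codes the PREEMPT_ACTIVE dance instead of wrapping preempt_schedule(), so exception_enter()/exception_exit() run while preemption is effectively off, and the whole thing loops on need_resched(). Compared with preempt_schedule() of the same era (paraphrased below for contrast, not part of this patch), the only addition is the context-tracking bracket around __schedule():

    /* preempt_schedule(), roughly, for comparison */
    asmlinkage __visible void __sched notrace preempt_schedule(void)
    {
            if (likely(!preemptible()))
                    return;

            do {
                    __preempt_count_add(PREEMPT_ACTIVE);
                    __schedule();   /* no exception_enter()/exit() here */
                    __preempt_count_sub(PREEMPT_ACTIVE);
                    barrier();
            } while (need_resched());
    }

The cpu_cgroup hunks are an independent fix in the same file: wiring up a .fork callback means a newly forked task is moved onto its cgroup's task group via sched_move_task() before it first runs, instead of inheriting a possibly stale group from its parent.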
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..5285332392d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ again:
         }
 
         /*
-         * We need to take care of a possible races here. In fact, the
-         * task might have changed its scheduling policy to something
-         * different from SCHED_DEADLINE or changed its reservation
-         * parameters (through sched_setattr()).
+         * We need to take care of several possible races here:
+         *
+         *   - the task might have changed its scheduling policy
+         *     to something different from SCHED_DEADLINE
+         *   - the task might have changed its reservation parameters
+         *     (through sched_setattr())
+         *   - the task might have been boosted by someone else and
+         *     might be in the boosting/deboosting path
+         *
+         * In all these cases we bail out, as the task is already
+         * in the runqueue or is going to be enqueued back anyway.
          */
-        if (!dl_task(p) || dl_se->dl_new)
+        if (!dl_task(p) || dl_se->dl_new ||
+            dl_se->dl_boosted || !dl_se->dl_throttled)
                 goto unlock;
 
         sched_clock_tick();
@@ -532,7 +540,7 @@ again:
         dl_se->dl_yielded = 0;
         if (task_on_rq_queued(p)) {
                 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-                if (task_has_dl_policy(rq->curr))
+                if (dl_task(rq->curr))
                         check_preempt_curr_dl(rq, p, 0);
                 else
                         resched_curr(rq);
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
          * smaller than our one... OTW we keep our runtime and
          * deadline.
          */
-        if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+        if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
                 pi_se = &pi_task->dl;
+        } else if (!dl_prio(p->normal_prio)) {
+                /*
+                 * Special case in which we have a !SCHED_DEADLINE task
+                 * that is going to be deboosted, but exceeds its
+                 * runtime while doing so. No point in replenishing
+                 * it, as it's going to return to its original
+                 * scheduling class after this.
+                 */
+                BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+                return;
+        }
 
         /*
          * If p is throttled, we do nothing. In fact, if it exhausted
@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
                 /* Only reschedule if pushing failed */
                 check_resched = 0;
 #endif /* CONFIG_SMP */
-                if (check_resched && task_has_dl_policy(rq->curr))
-                        check_preempt_curr_dl(rq, p, 0);
+                if (check_resched) {
+                        if (dl_task(rq->curr))
+                                check_preempt_curr_dl(rq, p, 0);
+                        else
+                                resched_curr(rq);
+                }
         }
 }
 
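Why dl_se->dl_boosted matters in the timer handler: the replenishment timer can fire while the task is being priority-inherited by someone else, in which case it is already runnable (or about to be re-enqueued) with the donor's parameters. A rough timeline of the race the new check guards against (illustrative only, derived from the comment above):

    /*
     *   CPU0                              CPU1
     *   ----                              ----
     *   p throttled, dl timer armed
     *                                     p boosted via rt_mutex:
     *                                       dl_se->dl_boosted = 1,
     *                                       p enqueued with donor params
     *   dl_task_timer() fires
     *     sees dl_se->dl_boosted set
     *       -> bail out: p is already
     *          runnable; replenishment
     *          happens on deboost instead
     */

The dl_task() vs. task_has_dl_policy() substitutions are of a piece with this: dl_task() also covers a boosted task that is not SCHED_DEADLINE by policy, so preemption decisions stay correct during boosting.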
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..34baa60f8a7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
+        unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
         unsigned int scan, floor;
         unsigned int windows = 1;
 
-        if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
-                windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+        if (scan_size < MAX_SCAN_WINDOW)
+                windows = MAX_SCAN_WINDOW / scan_size;
         floor = 1000 / windows;
 
         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
         long moveimp = imp;
 
         rcu_read_lock();
-        cur = ACCESS_ONCE(dst_rq->curr);
-        if (cur->pid == 0) /* idle */
+
+        raw_spin_lock_irq(&dst_rq->lock);
+        cur = dst_rq->curr;
+        /*
+         * No need to move the exiting task, and this ensures that ->curr
+         * wasn't reaped and thus get_task_struct() in task_numa_assign()
+         * is safe under RCU read lock.
+         * Note that rcu_read_lock() itself can't protect from the final
+         * put_task_struct() after the last schedule().
+         */
+        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                 cur = NULL;
+        raw_spin_unlock_irq(&dst_rq->lock);
 
         /*
          * "imp" is the fault differential for the source task between the
@@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
                  * scanning faster if shared accesses dominate as it may
                  * simply bounce migrations uselessly
                  */
-                ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+                ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
                 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
         }
 
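Three distinct fixes here. The task_numa_compare() hunk closes a use-after-free: RCU alone cannot keep dst_rq->curr alive past the final put_task_struct(), so ->curr is now sampled under the runqueue lock and exiting tasks are skipped before get_task_struct() runs later in task_numa_assign(). The DIV_ROUND_UP change adds 1 to the divisor so that private + shared == 0 (no recorded faults) no longer divides by zero. And the task_scan_min() hunk is the classic "read a racy variable once" pattern: the old code read the sysctl twice, so a concurrent write could slip between the comparison and the division. A minimal userspace rendering of that last pattern, with ACCESS_ONCE spelled out as the volatile cast it was (the knob name and values are illustrative, not from the tree):

    #include <stdio.h>

    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    static unsigned int scan_size_knob = 256;  /* racy: may be written elsewhere */
    #define MAX_SCAN_WINDOW 2560               /* MB, as in fair.c */

    static unsigned int windows_for_scan(void)
    {
            /* Snapshot once so the test and the division see the same value. */
            unsigned int scan_size = ACCESS_ONCE(scan_size_knob);
            unsigned int windows = 1;

            if (scan_size < MAX_SCAN_WINDOW)
                    windows = MAX_SCAN_WINDOW / scan_size;
            return windows;
    }

    int main(void)
    {
            printf("windows = %u\n", windows_for_scan());
            return 0;
    }

Note the snapshot alone does not rule out reading 0; that half of the fix lands in kernel/sysctl.c below.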
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d9fe74..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
                 .data           = &sysctl_numa_balancing_scan_size,
                 .maxlen         = sizeof(unsigned int),
                 .mode           = 0644,
-                .proc_handler   = proc_dointvec,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &one,
         },
         {
                 .procname       = "numa_balancing",
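This is the other half of the divide-by-zero fix: proc_dointvec_minmax with .extra1 pointing at the file-local static int one = 1 rejects any write below 1, so the scan-size sysctl can never become 0. The resulting entry looks roughly like this (paraphrased from kernel/sysctl.c; the procname is the scan-size knob this hunk sits in):

    {
            .procname       = "numa_balancing_scan_size_mb",
            .data           = &sysctl_numa_balancing_scan_size,
            .maxlen         = sizeof(unsigned int),
            .mode           = 0644,
            .proc_handler   = proc_dointvec_minmax,
            .extra1         = &one,  /* writes below 1 now fail with -EINVAL */
    },

With this in place, something like "sysctl kernel.numa_balancing_scan_size_mb=0" fails cleanly instead of planting a future divide-by-zero in task_scan_min().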