author     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>   2019-09-19 13:37:02 -0400
committer  Ingo Molnar <mingo@kernel.org>                       2019-09-25 11:42:30 -0400
commit     227a4aadc75ba22fcb6c4e1c078817b8cbaae4ce
tree       38eb00e930d17973a6f2e751e6ff87cac3acb6c5 /kernel/sched
parent     2840cf02fae627860156737e83326df354ee4ec6
sched/membarrier: Fix p->mm->membarrier_state racy load
The membarrier_state field is located within the mm_struct, which
is not guaranteed to exist when used from runqueue-lock-free iteration
on runqueues by the membarrier system call.

Copy the membarrier_state from the mm_struct into the scheduler runqueue
when the scheduler switches between mm.

When registering membarrier for mm, after setting the registration bit
in the mm membarrier state, issue a synchronize_rcu() to ensure the
scheduler observes the change. In order to take care of the case where a
runqueue keeps executing the target mm without swapping to other mm,
iterate over each runqueue and issue an IPI to copy the membarrier_state
from the mm_struct into each runqueue which have the same mm which state
has just been modified.

Move the mm membarrier_state field closer to pgd in mm_struct to use a
cache line already touched by the scheduler switch_mm.

The membarrier_execve() (now membarrier_exec_mmap) hook now needs to
clear the runqueue's membarrier state in addition to clear the mm
membarrier state, so move its implementation into the scheduler
membarrier code so it can access the runqueue structure.

Add memory barrier in membarrier_exec_mmap() prior to clearing the
membarrier state, ensuring memory accesses executed prior to exec are
not reordered with the stores clearing the membarrier state.

As suggested by Linus, move all membarrier.c RCU read-side locks outside
of the for each cpu loops.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190919173705.2181-5-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c         |   4
-rw-r--r--   kernel/sched/membarrier.c   | 175
-rw-r--r--   kernel/sched/sched.h        |  34
3 files changed, 168 insertions(+), 45 deletions(-)
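The following is a condensed, userspace-compilable model of the idea this patch implements: keep a per-runqueue snapshot of the mm's membarrier state so the expedited membarrier paths never have to dereference another task's mm. It is an illustrative sketch only: types are stubbed out, C11 atomics stand in for the kernel's atomic_t, and the names merely mirror the patch. The authoritative code is in the hunks below.

#include <stdatomic.h>
#include <stdio.h>

/* Stubbed stand-ins for the kernel structures touched by this patch. */
struct mm_struct { atomic_int membarrier_state; };
struct rq { int membarrier_state; };	/* one instance per CPU */

/*
 * Model of the context-switch hook: when switching to a different mm,
 * snapshot that mm's membarrier state into the runqueue, so readers can
 * look at rq->membarrier_state instead of p->mm->membarrier_state.
 */
static void model_membarrier_switch_mm(struct rq *rq,
				       struct mm_struct *prev_mm,
				       struct mm_struct *next_mm)
{
	int state;

	if (prev_mm == next_mm)
		return;
	state = atomic_load(&next_mm->membarrier_state);
	if (rq->membarrier_state == state)
		return;
	rq->membarrier_state = state;
}

int main(void)
{
	struct mm_struct prev = { .membarrier_state = 0 };
	struct mm_struct next = { .membarrier_state = 1 };
	struct rq rq = { .membarrier_state = 0 };

	model_membarrier_switch_mm(&rq, &prev, &next);
	printf("rq->membarrier_state = %d\n", rq.membarrier_state);
	return 0;
}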
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84c71160beb1..2d9a3947bef4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		else
 			prev->active_mm = NULL;
 	} else {				// to user
+		membarrier_switch_mm(rq, prev->active_mm, next->mm);
 		/*
 		 * sys_membarrier() requires an smp_mb() between setting
-		 * rq->curr and returning to userspace.
+		 * rq->curr / membarrier_switch_mm() and returning to userspace.
 		 *
 		 * The below provides this either through switch_mm(), or in
 		 * case 'prev->active_mm == next->mm' through
 		 * finish_task_switch()'s mmdrop().
 		 */
-
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {			// from kernel
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 7ccbd0e19626..070cf433bb9a 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,6 +30,39 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_rq_state(void *info)
+{
+	struct mm_struct *mm = (struct mm_struct *) info;
+
+	if (current->mm != mm)
+		return;
+	this_cpu_write(runqueues.membarrier_state,
+		       atomic_read(&mm->membarrier_state));
+	/*
+	 * Issue a memory barrier after setting
+	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+	 * guarantee that no memory access following registration is reordered
+	 * before registration.
+	 */
+	smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+	/*
+	 * Issue a memory barrier before clearing membarrier_state to
+	 * guarantee that no memory access prior to exec is reordered after
+	 * clearing this state.
+	 */
+	smp_mb();
+	atomic_set(&mm->membarrier_state, 0);
+	/*
+	 * Keep the runqueue membarrier_state in sync with this mm
+	 * membarrier_state.
+	 */
+	this_cpu_write(runqueues.membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
 	int cpu;
@@ -56,6 +89,7 @@ static int membarrier_global_expedited(void)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -70,17 +104,25 @@ static int membarrier_global_expedited(void)
 		if (cpu == raw_smp_processor_id())
 			continue;
 
-		rcu_read_lock();
+		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+			continue;
+
+		/*
+		 * Skip the CPU if it runs a kernel thread. The scheduler
+		 * leaves the prior task mm in place as an optimization when
+		 * scheduling a kthread.
+		 */
 		p = rcu_dereference(cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
-			if (!fallback)
-				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
-		}
-		rcu_read_unlock();
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		if (!fallback)
+			__cpumask_set_cpu(cpu, tmpmask);
+		else
+			smp_call_function_single(cpu, ipi_mb, NULL, 1);
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags)
 			else
 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
 		}
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags)
 	return 0;
 }
 
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int membarrier_state = atomic_read(&mm->membarrier_state);
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+		this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+		/*
+		 * For single mm user, we can simply issue a memory barrier
+		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+		 * mm and in the current runqueue to guarantee that no memory
+		 * access following registration is reordered before
+		 * registration.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * For mm with multiple users, we need to ensure all future
+	 * scheduler executions will observe @mm's new membarrier
+	 * state.
+	 */
+	synchronize_rcu();
+
+	/*
+	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
+	 * @mm's membarrier state set bits are also set in in the runqueue's
+	 * membarrier state. This ensures that a runqueue scheduling
+	 * between threads which are users of @mm has its membarrier state
+	 * updated.
+	 */
+	cpus_read_lock();
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct task_struct *p;
+
+		p = rcu_dereference(&rq->curr);
+		if (p && p->mm == mm)
+			__cpumask_set_cpu(cpu, tmpmask);
+	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
+	cpus_read_unlock();
+
+	return 0;
+}
+
 static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	int ret;
 
 	if (atomic_read(&mm->membarrier_state) &
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) == 1) {
-		/*
-		 * For single mm user, single threaded process, we can
-		 * simply issue a memory barrier after setting
-		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-		 * no memory access following registration is reordered
-		 * before registration.
-		 */
-		smp_mb();
-	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_rcu();
-	}
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
 
@@ -213,12 +302,15 @@ static int membarrier_register_private_expedited(int flags)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+	    ret;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 	}
 
 	/*
@@ -226,20 +318,15 @@ static int membarrier_register_private_expedited(int flags)
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if ((atomic_read(&mm->membarrier_state) & state) == state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) != 1) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_rcu();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	atomic_or(set_state, &mm->membarrier_state);
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
@@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags)
  * command specified does not exist, not available on the running
  * kernel, or if the command argument is invalid, this system call
  * returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
  *
  * All memory accesses performed in program order from each targeted thread
  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
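A hypothetical userspace sketch (not part of this commit) that exercises the semantics documented in the comment above: register for private expedited membarrier, then issue one, reporting errno on failure, which per the updated wording may now include ENOMEM. It assumes kernel headers that provide __NR_membarrier and the MEMBARRIER_CMD_* constants.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* glibc provides no wrapper for membarrier(2); call it via syscall(2). */
static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Registration may fail with ENOMEM (cpumask allocation). */
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
		perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");
		return 1;
	}
	/* Expedited barrier across all running threads of this process. */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("MEMBARRIER_CMD_PRIVATE_EXPEDITED");
		return 1;
	}
	printf("private expedited membarrier succeeded\n");
	return 0;
}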
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3cb895d14a2..0db2c1b3361e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -911,6 +911,10 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_MEMBARRIER
+	int membarrier_state;
+#endif
+
 #ifdef CONFIG_SMP
 	struct root_domain	*rd;
 	struct sched_domain __rcu	*sd;
@@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void)
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+	int membarrier_state;
+
+	if (prev_mm == next_mm)
+		return;
+
+	membarrier_state = atomic_read(&next_mm->membarrier_state);
+	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+		return;
+
+	WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+}
+#endif