diff options
author | Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | 2018-01-29 15:20:13 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2018-02-05 15:34:31 -0500 |
commit | c5f58bd58f432be5d92df33c5458e0bcbee3aadf (patch) | |
tree | 0a7c6d59b6101cd22de8a7da86b75010c84c199f | |
parent | 306e060435d7a3aef8f6f033e43b0f581638adce (diff) |
membarrier: Provide GLOBAL_EXPEDITED command
Allow expedited membarrier to be used for data shared between processes
through shared memory.
Processes wishing to receive the membarriers register with
MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. Those which want to issue
membarrier invoke MEMBARRIER_CMD_GLOBAL_EXPEDITED.
This allows extremely simple kernel-level implementation: we have almost
everything we need with the PRIVATE_EXPEDITED barrier code. All we need
to do is to add a flag in the mm_struct that will be used to check
whether we need to send the IPI to the current thread of each CPU.
There is a slight downside to this approach compared to targeting
specific shared memory users: when performing a membarrier operation,
all registered "global" receivers will get the barrier, even if they
don't share a memory mapping with the sender issuing
MEMBARRIER_CMD_GLOBAL_EXPEDITED.
This registration approach seems to fit the requirement of not
disturbing processes that really deeply care about real-time: they
simply should not register with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
In order to align the membarrier command names, the "MEMBARRIER_CMD_SHARED"
command is renamed to "MEMBARRIER_CMD_GLOBAL", keeping an alias of
MEMBARRIER_CMD_SHARED to MEMBARRIER_CMD_GLOBAL for UAPI header backward
compatibility.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: David Sehr <sehr@google.com>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maged Michael <maged.michael@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/20180129202020.8515-5-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | arch/powerpc/include/asm/membarrier.h | 3 | ||||
-rw-r--r-- | include/linux/sched/mm.h | 6 | ||||
-rw-r--r-- | include/uapi/linux/membarrier.h | 42 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 120 |
4 files changed, 153 insertions, 18 deletions
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h index 98ff4f1fcf2b..6e20bb5c74ea 100644 --- a/arch/powerpc/include/asm/membarrier.h +++ b/arch/powerpc/include/asm/membarrier.h | |||
@@ -13,7 +13,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, | |||
13 | * store to rq->curr. | 13 | * store to rq->curr. |
14 | */ | 14 | */ |
15 | if (likely(!(atomic_read(&next->membarrier_state) & | 15 | if (likely(!(atomic_read(&next->membarrier_state) & |
16 | MEMBARRIER_STATE_PRIVATE_EXPEDITED) || !prev)) | 16 | (MEMBARRIER_STATE_PRIVATE_EXPEDITED | |
17 | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) | ||
17 | return; | 18 | return; |
18 | 19 | ||
19 | /* | 20 | /* |
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index b84e0fde1d72..1c4e40c5efaf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h | |||
@@ -219,8 +219,10 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) | |||
219 | 219 | ||
220 | #ifdef CONFIG_MEMBARRIER | 220 | #ifdef CONFIG_MEMBARRIER |
221 | enum { | 221 | enum { |
222 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), | 222 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), |
223 | MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), | 223 | MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), |
224 | MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), | ||
225 | MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), | ||
224 | }; | 226 | }; |
225 | 227 | ||
226 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS | 228 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS |
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index 4e01ad7ffe98..d252506e1b5e 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h | |||
@@ -31,7 +31,7 @@ | |||
31 | * enum membarrier_cmd - membarrier system call command | 31 | * enum membarrier_cmd - membarrier system call command |
32 | * @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns | 32 | * @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns |
33 | * a bitmask of valid commands. | 33 | * a bitmask of valid commands. |
34 | * @MEMBARRIER_CMD_SHARED: Execute a memory barrier on all running threads. | 34 | * @MEMBARRIER_CMD_GLOBAL: Execute a memory barrier on all running threads. |
35 | * Upon return from system call, the caller thread | 35 | * Upon return from system call, the caller thread |
36 | * is ensured that all running threads have passed | 36 | * is ensured that all running threads have passed |
37 | * through a state where all memory accesses to | 37 | * through a state where all memory accesses to |
@@ -40,6 +40,28 @@ | |||
40 | * (non-running threads are de facto in such a | 40 | * (non-running threads are de facto in such a |
41 | * state). This covers threads from all processes | 41 | * state). This covers threads from all processes |
42 | * running on the system. This command returns 0. | 42 | * running on the system. This command returns 0. |
43 | * @MEMBARRIER_CMD_GLOBAL_EXPEDITED: | ||
44 | * Execute a memory barrier on all running threads | ||
45 | * of all processes which previously registered | ||
46 | * with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. | ||
47 | * Upon return from system call, the caller thread | ||
48 | * is ensured that all running threads have passed | ||
49 | * through a state where all memory accesses to | ||
50 | * user-space addresses match program order between | ||
51 | * entry to and return from the system call | ||
52 | * (non-running threads are de facto in such a | ||
53 | * state). This only covers threads from processes | ||
54 | * which registered with | ||
55 | * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. | ||
56 | * This command returns 0. Given that | ||
57 | * registration is about the intent to receive | ||
58 | * the barriers, it is valid to invoke | ||
59 | * MEMBARRIER_CMD_GLOBAL_EXPEDITED from a | ||
60 | * non-registered process. | ||
61 | * @MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: | ||
62 | * Register the process intent to receive | ||
63 | * MEMBARRIER_CMD_GLOBAL_EXPEDITED memory | ||
64 | * barriers. Always returns 0. | ||
43 | * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: | 65 | * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: |
44 | * Execute a memory barrier on each running | 66 | * Execute a memory barrier on each running |
45 | * thread belonging to the same process as the current | 67 | * thread belonging to the same process as the current |
@@ -64,18 +86,24 @@ | |||
64 | * Register the process intent to use | 86 | * Register the process intent to use |
65 | * MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always | 87 | * MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always |
66 | * returns 0. | 88 | * returns 0. |
89 | * @MEMBARRIER_CMD_SHARED: | ||
90 | * Alias to MEMBARRIER_CMD_GLOBAL. Provided for | ||
91 | * header backward compatibility. | ||
67 | * | 92 | * |
68 | * Command to be passed to the membarrier system call. The commands need to | 93 | * Command to be passed to the membarrier system call. The commands need to |
69 | * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to | 94 | * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to |
70 | * the value 0. | 95 | * the value 0. |
71 | */ | 96 | */ |
72 | enum membarrier_cmd { | 97 | enum membarrier_cmd { |
73 | MEMBARRIER_CMD_QUERY = 0, | 98 | MEMBARRIER_CMD_QUERY = 0, |
74 | MEMBARRIER_CMD_SHARED = (1 << 0), | 99 | MEMBARRIER_CMD_GLOBAL = (1 << 0), |
75 | /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ | 100 | MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1), |
76 | /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ | 101 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2), |
77 | MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), | 102 | MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), |
78 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), | 103 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), |
104 | |||
105 | /* Alias for header backward compatibility. */ | ||
106 | MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL, | ||
79 | }; | 107 | }; |
80 | 108 | ||
81 | #endif /* _UAPI_LINUX_MEMBARRIER_H */ | 109 | #endif /* _UAPI_LINUX_MEMBARRIER_H */ |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 678577267a9a..d2087d5f9837 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
@@ -27,7 +27,9 @@ | |||
27 | * except MEMBARRIER_CMD_QUERY. | 27 | * except MEMBARRIER_CMD_QUERY. |
28 | */ | 28 | */ |
29 | #define MEMBARRIER_CMD_BITMASK \ | 29 | #define MEMBARRIER_CMD_BITMASK \ |
30 | (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | 30 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ |
31 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | ||
32 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | ||
31 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) | 33 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) |
32 | 34 | ||
33 | static void ipi_mb(void *info) | 35 | static void ipi_mb(void *info) |
@@ -35,6 +37,73 @@ static void ipi_mb(void *info) | |||
35 | smp_mb(); /* IPIs should be serializing but paranoid. */ | 37 | smp_mb(); /* IPIs should be serializing but paranoid. */ |
36 | } | 38 | } |
37 | 39 | ||
40 | static int membarrier_global_expedited(void) | ||
41 | { | ||
42 | int cpu; | ||
43 | bool fallback = false; | ||
44 | cpumask_var_t tmpmask; | ||
45 | |||
46 | if (num_online_cpus() == 1) | ||
47 | return 0; | ||
48 | |||
49 | /* | ||
50 | * Matches memory barriers around rq->curr modification in | ||
51 | * scheduler. | ||
52 | */ | ||
53 | smp_mb(); /* system call entry is not a mb. */ | ||
54 | |||
55 | /* | ||
56 | * Expedited membarrier commands guarantee that they won't | ||
57 | * block, hence the GFP_NOWAIT allocation flag and fallback | ||
58 | * implementation. | ||
59 | */ | ||
60 | if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { | ||
61 | /* Fallback for OOM. */ | ||
62 | fallback = true; | ||
63 | } | ||
64 | |||
65 | cpus_read_lock(); | ||
66 | for_each_online_cpu(cpu) { | ||
67 | struct task_struct *p; | ||
68 | |||
69 | /* | ||
70 | * Skipping the current CPU is OK even though we can be | ||
71 | * migrated at any point. The current CPU, at the point | ||
72 | * where we read raw_smp_processor_id(), is ensured to | ||
73 | * be in program order with respect to the caller | ||
74 | * thread. Therefore, we can skip this CPU from the | ||
75 | * iteration. | ||
76 | */ | ||
77 | if (cpu == raw_smp_processor_id()) | ||
78 | continue; | ||
79 | rcu_read_lock(); | ||
80 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | ||
81 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & | ||
82 | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { | ||
83 | if (!fallback) | ||
84 | __cpumask_set_cpu(cpu, tmpmask); | ||
85 | else | ||
86 | smp_call_function_single(cpu, ipi_mb, NULL, 1); | ||
87 | } | ||
88 | rcu_read_unlock(); | ||
89 | } | ||
90 | if (!fallback) { | ||
91 | preempt_disable(); | ||
92 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); | ||
93 | preempt_enable(); | ||
94 | free_cpumask_var(tmpmask); | ||
95 | } | ||
96 | cpus_read_unlock(); | ||
97 | |||
98 | /* | ||
99 | * Memory barrier on the caller thread _after_ we finished | ||
100 | * waiting for the last IPI. Matches memory barriers around | ||
101 | * rq->curr modification in scheduler. | ||
102 | */ | ||
103 | smp_mb(); /* exit from system call is not a mb */ | ||
104 | return 0; | ||
105 | } | ||
106 | |||
38 | static int membarrier_private_expedited(void) | 107 | static int membarrier_private_expedited(void) |
39 | { | 108 | { |
40 | int cpu; | 109 | int cpu; |
@@ -105,7 +174,38 @@ static int membarrier_private_expedited(void) | |||
105 | return 0; | 174 | return 0; |
106 | } | 175 | } |
107 | 176 | ||
108 | static void membarrier_register_private_expedited(void) | 177 | static int membarrier_register_global_expedited(void) |
178 | { | ||
179 | struct task_struct *p = current; | ||
180 | struct mm_struct *mm = p->mm; | ||
181 | |||
182 | if (atomic_read(&mm->membarrier_state) & | ||
183 | MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) | ||
184 | return 0; | ||
185 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); | ||
186 | if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { | ||
187 | /* | ||
188 | * For single mm user, single threaded process, we can | ||
189 | * simply issue a memory barrier after setting | ||
190 | * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that | ||
191 | * no memory access following registration is reordered | ||
192 | * before registration. | ||
193 | */ | ||
194 | smp_mb(); | ||
195 | } else { | ||
196 | /* | ||
197 | * For multi-mm user threads, we need to ensure all | ||
198 | * future scheduler executions will observe the new | ||
199 | * thread flag state for this mm. | ||
200 | */ | ||
201 | synchronize_sched(); | ||
202 | } | ||
203 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, | ||
204 | &mm->membarrier_state); | ||
205 | return 0; | ||
206 | } | ||
207 | |||
208 | static int membarrier_register_private_expedited(void) | ||
109 | { | 209 | { |
110 | struct task_struct *p = current; | 210 | struct task_struct *p = current; |
111 | struct mm_struct *mm = p->mm; | 211 | struct mm_struct *mm = p->mm; |
@@ -117,7 +217,7 @@ static void membarrier_register_private_expedited(void) | |||
117 | */ | 217 | */ |
118 | if (atomic_read(&mm->membarrier_state) | 218 | if (atomic_read(&mm->membarrier_state) |
119 | & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) | 219 | & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) |
120 | return; | 220 | return 0; |
121 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); | 221 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); |
122 | if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { | 222 | if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { |
123 | /* | 223 | /* |
@@ -128,6 +228,7 @@ static void membarrier_register_private_expedited(void) | |||
128 | } | 228 | } |
129 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, | 229 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, |
130 | &mm->membarrier_state); | 230 | &mm->membarrier_state); |
231 | return 0; | ||
131 | } | 232 | } |
132 | 233 | ||
133 | /** | 234 | /** |
@@ -167,21 +268,24 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) | |||
167 | int cmd_mask = MEMBARRIER_CMD_BITMASK; | 268 | int cmd_mask = MEMBARRIER_CMD_BITMASK; |
168 | 269 | ||
169 | if (tick_nohz_full_enabled()) | 270 | if (tick_nohz_full_enabled()) |
170 | cmd_mask &= ~MEMBARRIER_CMD_SHARED; | 271 | cmd_mask &= ~MEMBARRIER_CMD_GLOBAL; |
171 | return cmd_mask; | 272 | return cmd_mask; |
172 | } | 273 | } |
173 | case MEMBARRIER_CMD_SHARED: | 274 | case MEMBARRIER_CMD_GLOBAL: |
174 | /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ | 275 | /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */ |
175 | if (tick_nohz_full_enabled()) | 276 | if (tick_nohz_full_enabled()) |
176 | return -EINVAL; | 277 | return -EINVAL; |
177 | if (num_online_cpus() > 1) | 278 | if (num_online_cpus() > 1) |
178 | synchronize_sched(); | 279 | synchronize_sched(); |
179 | return 0; | 280 | return 0; |
281 | case MEMBARRIER_CMD_GLOBAL_EXPEDITED: | ||
282 | return membarrier_global_expedited(); | ||
283 | case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: | ||
284 | return membarrier_register_global_expedited(); | ||
180 | case MEMBARRIER_CMD_PRIVATE_EXPEDITED: | 285 | case MEMBARRIER_CMD_PRIVATE_EXPEDITED: |
181 | return membarrier_private_expedited(); | 286 | return membarrier_private_expedited(); |
182 | case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: | 287 | case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: |
183 | membarrier_register_private_expedited(); | 288 | return membarrier_register_private_expedited(); |
184 | return 0; | ||
185 | default: | 289 | default: |
186 | return -EINVAL; | 290 | return -EINVAL; |
187 | } | 291 | } |