Diffstat (limited to 'kernel/rseq.c')
-rw-r--r--	kernel/rseq.c	357
1 file changed, 357 insertions, 0 deletions
diff --git a/kernel/rseq.c b/kernel/rseq.c
new file mode 100644
index 000000000000..ae306f90c514
--- /dev/null
+++ b/kernel/rseq.c
@@ -0,0 +1,357 @@
// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
				       RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. They are typically used to implement
 * per-cpu operations.
 *
 * They allow user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of the jump target abort_ip must be outside the
 *   critical region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests whether the current cpu_id field matches the
 *       cpu number loaded before start_ip, branching to abort_ip in
 *       case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  The final instruction of the userspace critical section before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preemption or signal
 *   delivery between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
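
/*
 * To make the algorithm above concrete, here is a minimal user-space
 * sketch of a per-cpu counter increment. It is illustrative only: the
 * label names, helper names and the idea of expressing the sequence in
 * C are assumptions of this example. Real users emit steps [1]-[3] as
 * a single inline assembly block, so that start_ip, post_commit_ip and
 * abort_ip are stable addresses the kernel can compare against:
 *
 *	static __thread struct rseq rseq_abi;	// registered TLS area
 *
 *	int percpu_inc(intptr_t *percpu_count)
 *	{
 *		struct rseq_cs cs = {
 *			.version = 0, .flags = 0,
 *			.start_ip = (u64)&&start_ip,
 *			.post_commit_offset = ...,	// size of [2]-[3]
 *			.abort_ip = (u64)&&abort_ip,
 *		};
 *		u32 cpu = READ_ONCE(rseq_abi.cpu_id_start);
 *
 *		WRITE_ONCE(rseq_abi.rseq_cs, (u64)&cs);	// [1]
 *	start_ip:
 *		if (cpu != READ_ONCE(rseq_abi.cpu_id))	// [2]
 *			goto abort_ip;
 *		percpu_count[cpu]++;			// [3] commit
 *		return 0;
 *	abort_ip:
 *		return -1;	// caller retries or takes a fallback path
 *	}
 */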
83 | |||
84 | static int rseq_update_cpu_id(struct task_struct *t) | ||
85 | { | ||
86 | u32 cpu_id = raw_smp_processor_id(); | ||
87 | |||
88 | if (__put_user(cpu_id, &t->rseq->cpu_id_start)) | ||
89 | return -EFAULT; | ||
90 | if (__put_user(cpu_id, &t->rseq->cpu_id)) | ||
91 | return -EFAULT; | ||
92 | trace_rseq_update(t); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int rseq_reset_rseq_cpu_id(struct task_struct *t) | ||
97 | { | ||
98 | u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; | ||
99 | |||
100 | /* | ||
101 | * Reset cpu_id_start to its initial state (0). | ||
102 | */ | ||
103 | if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) | ||
104 | return -EFAULT; | ||
105 | /* | ||
106 | * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming | ||
107 | * in after unregistration can figure out that rseq needs to be | ||
108 | * registered again. | ||
109 | */ | ||
110 | if (__put_user(cpu_id, &t->rseq->cpu_id)) | ||
111 | return -EFAULT; | ||
112 | return 0; | ||
113 | } | ||
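
/*
 * After this reset, a user-space fast path can notice that it is not
 * (or no longer) registered and take a slow path instead. A possible
 * check (sketch; "rseq_abi" and "do_locked_fallback" are this
 * example's names, and RSEQ_CPU_ID_UNINITIALIZED is the uapi value
 * stored above):
 *
 *	if ((int32_t)READ_ONCE(rseq_abi.cpu_id) < 0)
 *		return do_locked_fallback();	// not registered
 */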
114 | |||
115 | static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) | ||
116 | { | ||
117 | struct rseq_cs __user *urseq_cs; | ||
118 | unsigned long ptr; | ||
119 | u32 __user *usig; | ||
120 | u32 sig; | ||
121 | int ret; | ||
122 | |||
123 | ret = __get_user(ptr, &t->rseq->rseq_cs); | ||
124 | if (ret) | ||
125 | return ret; | ||
126 | if (!ptr) { | ||
127 | memset(rseq_cs, 0, sizeof(*rseq_cs)); | ||
128 | return 0; | ||
129 | } | ||
130 | urseq_cs = (struct rseq_cs __user *)ptr; | ||
131 | if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) | ||
132 | return -EFAULT; | ||
133 | if (rseq_cs->version > 0) | ||
134 | return -EINVAL; | ||
135 | |||
136 | /* Ensure that abort_ip is not in the critical section. */ | ||
137 | if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) | ||
138 | return -EINVAL; | ||
139 | |||
140 | usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); | ||
141 | ret = get_user(sig, usig); | ||
142 | if (ret) | ||
143 | return ret; | ||
144 | |||
145 | if (current->rseq_sig != sig) { | ||
146 | printk_ratelimited(KERN_WARNING | ||
147 | "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", | ||
148 | sig, current->rseq_sig, current->pid, usig); | ||
149 | return -EPERM; | ||
150 | } | ||
151 | return 0; | ||
152 | } | ||
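
/*
 * The signature checked above is expected to be placed by user-space
 * immediately before the abort handler, typically from the inline
 * assembly that defines the critical section. A sketch, where RSEQ_SIG
 * is the application-chosen value passed to sys_rseq() at registration
 * time:
 *
 *	.long	RSEQ_SIG
 *	abort_ip:
 *		<abort handler>
 *
 * Requiring a signature match means the kernel only ever redirects
 * execution to an abort_ip that the registering code itself marked,
 * rather than to an arbitrary attacker-chosen address.
 */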
153 | |||
154 | static int rseq_need_restart(struct task_struct *t, u32 cs_flags) | ||
155 | { | ||
156 | u32 flags, event_mask; | ||
157 | int ret; | ||
158 | |||
159 | /* Get thread flags. */ | ||
160 | ret = __get_user(flags, &t->rseq->flags); | ||
161 | if (ret) | ||
162 | return ret; | ||
163 | |||
164 | /* Take critical section flags into account. */ | ||
165 | flags |= cs_flags; | ||
166 | |||
167 | /* | ||
168 | * Restart on signal can only be inhibited when restart on | ||
169 | * preempt and restart on migrate are inhibited too. Otherwise, | ||
170 | * a preempted signal handler could fail to restart the prior | ||
171 | * execution context on sigreturn. | ||
172 | */ | ||
173 | if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && | ||
174 | (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != | ||
175 | RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | /* | ||
179 | * Load and clear event mask atomically with respect to | ||
180 | * scheduler preemption. | ||
181 | */ | ||
182 | preempt_disable(); | ||
183 | event_mask = t->rseq_event_mask; | ||
184 | t->rseq_event_mask = 0; | ||
185 | preempt_enable(); | ||
186 | |||
187 | return !!(event_mask & ~flags); | ||
188 | } | ||
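
/*
 * For example, a critical section whose descriptor sets only
 * RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL is rejected with -EINVAL by the
 * check above, while one setting all three NO_RESTART flags is
 * accepted and is then never restarted, since every event bit ends up
 * masked out of event_mask.
 */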
189 | |||
190 | static int clear_rseq_cs(struct task_struct *t) | ||
191 | { | ||
192 | /* | ||
193 | * The rseq_cs field is set to NULL on preemption or signal | ||
194 | * delivery on top of rseq assembly block, as well as on top | ||
195 | * of code outside of the rseq assembly block. This performs | ||
196 | * a lazy clear of the rseq_cs field. | ||
197 | * | ||
198 | * Set rseq_cs to NULL with single-copy atomicity. | ||
199 | */ | ||
200 | return __put_user(0UL, &t->rseq->rseq_cs); | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Unsigned comparison will be true when ip >= start_ip, and when | ||
205 | * ip < start_ip + post_commit_offset. | ||
206 | */ | ||
207 | static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) | ||
208 | { | ||
209 | return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; | ||
210 | } | ||
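
/*
 * Worked example of the single-compare bounds check: with
 * start_ip = 0x1000 and post_commit_offset = 0x20, ip = 0x1010 yields
 * 0x10 < 0x20 (inside), ip = 0x1020 yields 0x20 (outside), and
 * ip = 0xff0 wraps around to a huge unsigned value (outside), so one
 * unsigned comparison covers both the lower and the upper bound.
 */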
211 | |||
212 | static int rseq_ip_fixup(struct pt_regs *regs) | ||
213 | { | ||
214 | unsigned long ip = instruction_pointer(regs); | ||
215 | struct task_struct *t = current; | ||
216 | struct rseq_cs rseq_cs; | ||
217 | int ret; | ||
218 | |||
219 | ret = rseq_get_rseq_cs(t, &rseq_cs); | ||
220 | if (ret) | ||
221 | return ret; | ||
222 | |||
223 | /* | ||
224 | * Handle potentially not being within a critical section. | ||
225 | * If not nested over a rseq critical section, restart is useless. | ||
226 | * Clear the rseq_cs pointer and return. | ||
227 | */ | ||
228 | if (!in_rseq_cs(ip, &rseq_cs)) | ||
229 | return clear_rseq_cs(t); | ||
230 | ret = rseq_need_restart(t, rseq_cs.flags); | ||
231 | if (ret <= 0) | ||
232 | return ret; | ||
233 | ret = clear_rseq_cs(t); | ||
234 | if (ret) | ||
235 | return ret; | ||
236 | trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, | ||
237 | rseq_cs.abort_ip); | ||
238 | instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * This resume handler must always be executed between any of: | ||
244 | * - preemption, | ||
245 | * - signal delivery, | ||
246 | * and return to user-space. | ||
247 | * | ||
248 | * This is how we can ensure that the entire rseq critical section, | ||
249 | * consisting of both the C part and the assembly instruction sequence, | ||
250 | * will issue the commit instruction only if executed atomically with | ||
251 | * respect to other threads scheduled on the same CPU, and with respect | ||
252 | * to signal handlers. | ||
253 | */ | ||
254 | void __rseq_handle_notify_resume(struct pt_regs *regs) | ||
255 | { | ||
256 | struct task_struct *t = current; | ||
257 | int ret; | ||
258 | |||
259 | if (unlikely(t->flags & PF_EXITING)) | ||
260 | return; | ||
261 | if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) | ||
262 | goto error; | ||
263 | ret = rseq_ip_fixup(regs); | ||
264 | if (unlikely(ret < 0)) | ||
265 | goto error; | ||
266 | if (unlikely(rseq_update_cpu_id(t))) | ||
267 | goto error; | ||
268 | return; | ||
269 | |||
270 | error: | ||
271 | force_sig(SIGSEGV, t); | ||
272 | } | ||
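
/*
 * Architectures invoke this via the rseq_handle_notify_resume() helper
 * on their way back to user-space, typically from the TIF_NOTIFY_RESUME
 * handling in their signal code. A sketch of such a call site (the
 * surrounding code is illustrative, not taken from any particular
 * architecture):
 *
 *	if (thread_flags & _TIF_NOTIFY_RESUME) {
 *		clear_thread_flag(TIF_NOTIFY_RESUME);
 *		tracehook_notify_resume(regs);
 *		rseq_handle_notify_resume(regs);
 *	}
 */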
273 | |||
274 | #ifdef CONFIG_DEBUG_RSEQ | ||
275 | |||
276 | /* | ||
277 | * Terminate the process if a syscall is issued within a restartable | ||
278 | * sequence. | ||
279 | */ | ||
280 | void rseq_syscall(struct pt_regs *regs) | ||
281 | { | ||
282 | unsigned long ip = instruction_pointer(regs); | ||
283 | struct task_struct *t = current; | ||
284 | struct rseq_cs rseq_cs; | ||
285 | |||
286 | if (!t->rseq) | ||
287 | return; | ||
288 | if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || | ||
289 | rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) | ||
290 | force_sig(SIGSEGV, t); | ||
291 | } | ||
292 | |||
293 | #endif | ||
294 | |||
295 | /* | ||
296 | * sys_rseq - setup restartable sequences for caller thread. | ||
297 | */ | ||
298 | SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, | ||
299 | int, flags, u32, sig) | ||
300 | { | ||
301 | int ret; | ||
302 | |||
303 | if (flags & RSEQ_FLAG_UNREGISTER) { | ||
304 | /* Unregister rseq for current thread. */ | ||
305 | if (current->rseq != rseq || !current->rseq) | ||
306 | return -EINVAL; | ||
307 | if (current->rseq_len != rseq_len) | ||
308 | return -EINVAL; | ||
309 | if (current->rseq_sig != sig) | ||
310 | return -EPERM; | ||
311 | ret = rseq_reset_rseq_cpu_id(current); | ||
312 | if (ret) | ||
313 | return ret; | ||
314 | current->rseq = NULL; | ||
315 | current->rseq_len = 0; | ||
316 | current->rseq_sig = 0; | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | if (unlikely(flags)) | ||
321 | return -EINVAL; | ||
322 | |||
323 | if (current->rseq) { | ||
324 | /* | ||
325 | * If rseq is already registered, check whether | ||
326 | * the provided address differs from the prior | ||
327 | * one. | ||
328 | */ | ||
329 | if (current->rseq != rseq || current->rseq_len != rseq_len) | ||
330 | return -EINVAL; | ||
331 | if (current->rseq_sig != sig) | ||
332 | return -EPERM; | ||
333 | /* Already registered. */ | ||
334 | return -EBUSY; | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * If there was no rseq previously registered, | ||
339 | * ensure the provided rseq is properly aligned and valid. | ||
340 | */ | ||
341 | if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || | ||
342 | rseq_len != sizeof(*rseq)) | ||
343 | return -EINVAL; | ||
344 | if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) | ||
345 | return -EFAULT; | ||
346 | current->rseq = rseq; | ||
347 | current->rseq_len = rseq_len; | ||
348 | current->rseq_sig = sig; | ||
349 | /* | ||
350 | * If rseq was previously inactive, and has just been | ||
351 | * registered, ensure the cpu_id_start and cpu_id fields | ||
352 | * are updated before returning to user-space. | ||
353 | */ | ||
354 | rseq_set_notify_resume(current); | ||
355 | |||
356 | return 0; | ||
357 | } | ||
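
/*
 * User-space registration sketch. No libc wrapper is assumed here, so
 * syscall(2) is used directly; RSEQ_SIG stands for the application-
 * chosen signature that must also precede every abort handler:
 *
 *	#include <linux/rseq.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static __thread struct rseq rseq_abi;	// uapi type carries its
 *						// own alignment attribute
 *
 *	if (syscall(__NR_rseq, &rseq_abi, sizeof(rseq_abi), 0, RSEQ_SIG))
 *		perror("rseq registration");
 *
 * Registration is per thread: each thread wanting the fast path must
 * issue the call itself, and unregistration (RSEQ_FLAG_UNREGISTER)
 * must pass back the same address, length and signature.
 */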