author     Andrea Arcangeli <aarcange@redhat.com>    2015-09-04 18:47:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2015-09-04 19:54:41 -0400
commit     dfa37dc3fc1f6f81a6900d0e561c02362f4817f6 (patch)
tree       ec3267d5e11f9c8ca774e52c827e757d3a228d52 /fs/userfaultfd.c
parent     e6485a47b758cae04a496764a1095961ee3249e4 (diff)
userfaultfd: allow signals to interrupt a userfault
This is only simple to achieve if the userfault is going to return to
userland (not to the kernel), because then we can avoid returning
VM_FAULT_RETRY despite having temporarily released the mmap_sem: the
fault is simply retried by userland. This is safe at least on x86 and
powerpc (the two archs with the syscall implemented so far).

Hint to verify for which archs this is safe: after handle_mm_fault
returns, the fault code in arch/*/mm/fault.c must not access any data
structure protected by the mmap_sem until up_read(&mm->mmap_sem) is
called.

This has two main benefits: signals can run with lower latency in
production (signals aren't blocked by userfaults and userfaults are
immediately repeated after signal processing), and gdb can then
trivially debug threads blocked in this kind of userfault coming
directly from userland.

On a side note: while gdb needs signals to be processed, coredumps have
always worked perfectly with userfaults, no matter whether the userfault
is triggered by GUP, a kernel copy_user, or directly from userland.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
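To make the new semantics concrete, below is a minimal user-space sketch
(not part of this commit; error handling omitted, and it assumes kernel
headers that provide __NR_userfaultfd and <linux/userfaultfd.h>). A thread
registers an anonymous page with userfaultfd, arms SIGALRM, and then
touches the missing page so it blocks in handle_userfault(). With this
patch the signal handler runs while the userfault is still outstanding and
the faulting access is transparently retried afterwards; before, only
fatal signals could interrupt the wait.

/* Minimal sketch, not from this commit: a thread blocked in a userfault
 * taken directly from userland can now be interrupted by a non-fatal
 * signal. Error checking is omitted for brevity. */
#include <fcntl.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static void on_alarm(int sig)
{
	(void)sig;
	/* With this patch the handler runs even though the faulting thread
	 * is parked in handle_userfault(); previously only fatal signals
	 * could interrupt the wait. */
	write(STDOUT_FILENO, "SIGALRM handled during userfault\n", 33);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* Open a userfaultfd and negotiate the API. */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	/* Register one anonymous page for missing-page tracking. */
	char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	signal(SIGALRM, on_alarm);
	alarm(1);

	/*
	 * Nobody services the fault, so this access blocks in
	 * handle_userfault(). After one second SIGALRM is delivered, the
	 * fault returns to userland, the handler runs, and the access is
	 * retried (blocking again, until the process is killed).
	 */
	area[0] = 1;
	return 0;
}

gdb relies on the same property: a SIGSTOP aimed at such a thread now
stops it immediately instead of staying pending behind the userfault.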
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--  fs/userfaultfd.c  35
1 file changed, 32 insertions(+), 3 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index af88ef6fffff..a14d63e945f4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -262,7 +262,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	struct userfaultfd_ctx *ctx;
 	struct userfaultfd_wait_queue uwq;
 	int ret;
-	bool must_wait;
+	bool must_wait, return_to_userland;
 
 	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
@@ -327,6 +327,9 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	uwq.msg = userfault_msg(address, flags, reason);
 	uwq.ctx = ctx;
 
+	return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+		(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
 	spin_lock(&ctx->fault_pending_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
@@ -338,14 +341,16 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	 * following the spin_unlock to happen before the list_add in
 	 * __add_wait_queue.
 	 */
-	set_current_state(TASK_KILLABLE);
+	set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+			  TASK_KILLABLE);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 
 	must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
 	up_read(&mm->mmap_sem);
 
 	if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
-		   !fatal_signal_pending(current))) {
+		   (return_to_userland ? !signal_pending(current) :
+		    !fatal_signal_pending(current)))) {
 		wake_up_poll(&ctx->fd_wqh, POLLIN);
 		schedule();
 		ret |= VM_FAULT_MAJOR;
@@ -353,6 +358,30 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 
 	__set_current_state(TASK_RUNNING);
 
+	if (return_to_userland) {
+		if (signal_pending(current) &&
+		    !fatal_signal_pending(current)) {
+			/*
+			 * If we got a SIGSTOP or SIGCONT and this is
+			 * a normal userland page fault, just let
+			 * userland return so the signal will be
+			 * handled and gdb debugging works. The page
+			 * fault code immediately after we return from
+			 * this function is going to release the
+			 * mmap_sem and it's not depending on it
+			 * (unlike gup would if we were not to return
+			 * VM_FAULT_RETRY).
+			 *
+			 * If a fatal signal is pending we still take
+			 * the streamlined VM_FAULT_RETRY failure path
+			 * and there's no need to retake the mmap_sem
+			 * in such case.
+			 */
+			down_read(&mm->mmap_sem);
+			ret = 0;
+		}
+	}
+
 	/*
 	 * Here we race with the list_del; list_add in
 	 * userfaultfd_ctx_read(), however because we don't ever run