 fs/userfaultfd.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ae0b8b5f69e6..ccbdbd62f0d8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -40,6 +40,16 @@ enum userfaultfd_state {
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
  * to be in the same cacheline.
+ *
+ * Locking order:
+ *	fd_wqh.lock
+ *		fault_pending_wqh.lock
+ *			fault_wqh.lock
+ *			event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
  */
 struct userfaultfd_ctx {
 	/* waitqueue head for the pending (i.e. not read) userfaults */
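
The comment added above records both the lock nesting and why every lock in the hierarchy must now be taken with IRQs disabled: aio_poll() acquires fd_wqh.lock while already holding a lock that is also taken in IRQ context, so leaving IRQs enabled inside any of these critical sections opens a deadlock window. A minimal sketch of the resulting pattern (illustrative only; the helper name is hypothetical, not code from this patch):

#include <linux/spinlock.h>
#include <linux/wait.h>

/* Hypothetical helper showing the IRQ-safe waitqueue-lock pattern. */
static void queue_fault_sketch(wait_queue_head_t *wqh,
			       wait_queue_entry_t *entry)
{
	spin_lock_irq(&wqh->lock);	/* IRQs off: an interrupt on this CPU
					 * can no longer spin on a lock we
					 * already hold */
	__add_wait_queue(wqh, entry);
	spin_unlock_irq(&wqh->lock);	/* IRQs back on */
}
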
@@ -458,7 +468,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
 			 TASK_KILLABLE;
 
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -470,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * __add_wait_queue.
 	 */
 	set_current_state(blocking_state);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	if (!is_vm_hugetlb_page(vmf->vma))
 		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -552,13 +562,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * kernel stack can be released after the list_del_init.
 	 */
 	if (!list_empty_careful(&uwq.wq.entry)) {
-		spin_lock(&ctx->fault_pending_wqh.lock);
+		spin_lock_irq(&ctx->fault_pending_wqh.lock);
 		/*
 		 * No need of list_del_init(), the uwq on the stack
 		 * will be freed shortly anyway.
 		 */
 		list_del(&uwq.wq.entry);
-		spin_unlock(&ctx->fault_pending_wqh.lock);
+		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 	}
 
 	/*
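
The three hunks above convert handle_userfault()'s blocking sequence: queue the wait entry and set the task state under fault_pending_wqh.lock, drop the lock, sleep, then dequeue on the way out. A simplified sketch of that sequence, under the assumption of a bare wait with no hugetlb or retry handling (the function name is hypothetical):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static void wait_for_fault_sketch(wait_queue_head_t *wqh,
				  wait_queue_entry_t *entry)
{
	spin_lock_irq(&wqh->lock);
	__add_wait_queue(wqh, entry);
	/* set the state before unlocking so a wakeup racing between the
	 * unlock and schedule() cannot be lost */
	set_current_state(TASK_KILLABLE);
	spin_unlock_irq(&wqh->lock);

	schedule();			/* sleep until woken or killed */
	__set_current_state(TASK_RUNNING);

	/* dequeue only if the waker did not already unlink the entry */
	if (!list_empty_careful(&entry->entry)) {
		spin_lock_irq(&wqh->lock);
		list_del(&entry->entry);
		spin_unlock_irq(&wqh->lock);
	}
}
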
@@ -583,7 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	init_waitqueue_entry(&ewq->wq, current);
 	release_new_ctx = NULL;
 
-	spin_lock(&ctx->event_wqh.lock);
+	spin_lock_irq(&ctx->event_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -613,15 +623,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 			break;
 		}
 
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 
 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();
 
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->event_wqh.lock);
+	spin_unlock_irq(&ctx->event_wqh.lock);
 
 	if (release_new_ctx) {
 		struct vm_area_struct *vma;
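
userfaultfd_event_wait_completion() has to drop event_wqh.lock twice per loop iteration: wake_up_poll() takes fd_wqh.lock, which is outermost in the documented order, and schedule() must never be called with a spinlock held. A sketch of the loop shape, simplified from the function above (signal handling and the release path omitted):

static void event_wait_sketch(struct userfaultfd_ctx *ctx,
			      struct userfaultfd_wait_queue *ewq)
{
	spin_lock_irq(&ctx->event_wqh.lock);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)	/* reader consumed the event */
			break;
		spin_unlock_irq(&ctx->event_wqh.lock);

		/* fd_wqh.lock is outermost: take it only with event_wqh
		 * unlocked */
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);
}
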
@@ -918,10 +928,10 @@ wakeup:
 	 * the last page faults that may have been already waiting on
 	 * the fault_*wqh.
 	 */
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	/* Flush pending events that may still wait on event_wqh */
 	wake_up_all(&ctx->event_wqh);
@@ -1134,7 +1144,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 
 	if (!ret && msg->event == UFFD_EVENT_FORK) {
 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 		if (!list_empty(&fork_event)) {
 			/*
 			 * The fork thread didn't abort, so we can
@@ -1180,7 +1190,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			if (ret)
 				userfaultfd_ctx_put(fork_nctx);
 		}
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 	}
 
 	return ret;
@@ -1219,14 +1229,14 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
 static void __wake_userfault(struct userfaultfd_ctx *ctx,
 			     struct userfaultfd_wake_range *range)
 {
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/* wake all in the range and autoremove */
 	if (waitqueue_active(&ctx->fault_pending_wqh))
 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
 				     range);
 	if (waitqueue_active(&ctx->fault_wqh))
 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 }
 
 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
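
Every converted path here runs with IRQs enabled, so the unconditional spin_lock_irq()/spin_unlock_irq() pair is sufficient. A caller that might itself run with IRQs already disabled would need the flag-saving variants instead (a sketch, not part of this patch):

	unsigned long flags;

	spin_lock_irqsave(&ctx->fault_pending_wqh.lock, flags);
	/* ... wakeups as in __wake_userfault() above ... */
	spin_unlock_irqrestore(&ctx->fault_pending_wqh.lock, flags);
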
@@ -1881,7 +1891,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	wait_queue_entry_t *wq;
 	unsigned long pending = 0, total = 0;
 
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
 		pending++;
 		total++;
@@ -1889,7 +1899,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
 		total++;
 	}
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	/*
 	 * If more protocols will be added, there will be all shown