 fs/userfaultfd.c | 42 ++++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ae0b8b5f69e6..ccbdbd62f0d8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -40,6 +40,16 @@ enum userfaultfd_state {
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
  * to be in the same cacheline.
+ *
+ * Locking order:
+ *	fd_wqh.lock
+ *		fault_pending_wqh.lock
+ *			fault_wqh.lock
+ *		event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
  */
 struct userfaultfd_ctx {
 	/* waitqueue head for the pending (i.e. not read) userfaults */
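The comment block added above states the rationale for every hunk that follows: aio_poll() takes fd_wqh.lock while already holding a lock that is also taken in IRQ context, so a task holding any lock in this hierarchy with IRQs enabled could be interrupted on the same CPU and deadlock against itself. A minimal sketch of the idiom the patch converts to, using a hypothetical demo_lock in place of the real waitqueue locks:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);	/* hypothetical; stands in for e.g. fault_pending_wqh.lock */

static void demo_process_context(void)
{
	/*
	 * Plain spin_lock() would be fatal if an interrupt handler that
	 * also takes demo_lock fired on this CPU while we hold it: the
	 * handler would spin forever on a lock its own CPU already owns.
	 * spin_lock_irq() masks local IRQs for the critical section.
	 */
	spin_lock_irq(&demo_lock);
	/* ... touch state shared with IRQ context ... */
	spin_unlock_irq(&demo_lock);	/* re-enables local IRQs */
}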
@@ -458,7 +468,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
 			 TASK_KILLABLE;
 
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -470,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * __add_wait_queue.
 	 */
 	set_current_state(blocking_state);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	if (!is_vm_hugetlb_page(vmf->vma))
 		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -552,13 +562,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * kernel stack can be released after the list_del_init.
 	 */
 	if (!list_empty_careful(&uwq.wq.entry)) {
-		spin_lock(&ctx->fault_pending_wqh.lock);
+		spin_lock_irq(&ctx->fault_pending_wqh.lock);
 		/*
 		 * No need of list_del_init(), the uwq on the stack
 		 * will be freed shortly anyway.
 		 */
 		list_del(&uwq.wq.entry);
-		spin_unlock(&ctx->fault_pending_wqh.lock);
+		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 	}
 
 	/*
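The patch uses spin_lock_irq()/spin_unlock_irq() here rather than the irqsave variants, which is valid because these paths run in process context where IRQs are known to be enabled; unconditionally re-enabling IRQs on unlock is then correct and avoids saving and restoring the flags word. For contrast, a caller with unknown IRQ state would need the irqsave form, as in this hypothetical helper (not from the patch):

#include <linux/spinlock.h>

static void demo_unknown_irq_state(spinlock_t *lock)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);		/* safe whether IRQs were on or off */
	/* ... critical section ... */
	spin_unlock_irqrestore(lock, flags);	/* restores the caller's IRQ state */
}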
@@ -583,7 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	init_waitqueue_entry(&ewq->wq, current);
 	release_new_ctx = NULL;
 
-	spin_lock(&ctx->event_wqh.lock);
+	spin_lock_irq(&ctx->event_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -613,15 +623,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 			break;
 		}
 
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 
 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();
 
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->event_wqh.lock);
+	spin_unlock_irq(&ctx->event_wqh.lock);
 
 	if (release_new_ctx) {
 		struct vm_area_struct *vma;
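The hunk above preserves the standard wait-loop shape while adding IRQ disabling: mark the task non-runnable under the lock, drop the lock (re-enabling IRQs) before wake_up_poll() and schedule(), then retake the lock to recheck the condition. A condensed sketch of that shape, with a hypothetical demo_ctx standing in for userfaultfd_ctx:

#include <linux/sched.h>
#include <linux/wait.h>

struct demo_ctx {			/* hypothetical */
	wait_queue_head_t wqh;
	bool done;
};

static void demo_wait_loop(struct demo_ctx *ctx)
{
	spin_lock_irq(&ctx->wqh.lock);
	while (!ctx->done) {
		set_current_state(TASK_KILLABLE);
		/* never call schedule() with a spinlock held or IRQs off */
		spin_unlock_irq(&ctx->wqh.lock);
		schedule();
		spin_lock_irq(&ctx->wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->wqh.lock);
}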
@@ -918,10 +928,10 @@ wakeup:
 	 * the last page faults that may have been already waiting on
 	 * the fault_*wqh.
 	 */
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	/* Flush pending events that may still wait on event_wqh */
 	wake_up_all(&ctx->event_wqh);
@@ -1134,7 +1144,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 
 	if (!ret && msg->event == UFFD_EVENT_FORK) {
 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 		if (!list_empty(&fork_event)) {
 			/*
 			 * The fork thread didn't abort, so we can
@@ -1180,7 +1190,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			if (ret)
 				userfaultfd_ctx_put(fork_nctx);
 		}
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 	}
 
 	return ret;
@@ -1219,14 +1229,14 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
 static void __wake_userfault(struct userfaultfd_ctx *ctx,
 			     struct userfaultfd_wake_range *range)
 {
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/* wake all in the range and autoremove */
 	if (waitqueue_active(&ctx->fault_pending_wqh))
 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
 				     range);
 	if (waitqueue_active(&ctx->fault_wqh))
 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 }
 
 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
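__wake_userfault() mixes two wake primitives: __wake_up_locked_key() is the variant for a caller that already holds the waitqueue's internal lock, while __wake_up() acquires the target queue's lock itself (internally with irqsave, so it nests safely in a section that already has IRQs disabled). A sketch of that split with hypothetical queue arguments:

#include <linux/sched.h>
#include <linux/wait.h>

static void demo_wake_both(wait_queue_head_t *held_q,
			   wait_queue_head_t *other_q, void *key)
{
	spin_lock_irq(&held_q->lock);
	/* we hold held_q->lock, so the _locked variant must be used on it */
	__wake_up_locked_key(held_q, TASK_NORMAL, key);
	/* __wake_up() takes other_q->lock internally, one waiter at most */
	__wake_up(other_q, TASK_NORMAL, 1, key);
	spin_unlock_irq(&held_q->lock);
}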
@@ -1881,7 +1891,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	wait_queue_entry_t *wq;
 	unsigned long pending = 0, total = 0;
 
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
 		pending++;
 		total++;
@@ -1889,7 +1899,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
 		total++;
 	}
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	/*
 	 * If more protocols will be added, there will be all shown
