author     Alex Williamson <alex.williamson@redhat.com>    2018-05-11 11:05:02 -0400
committer  Alex Williamson <alex.williamson@redhat.com>    2018-06-08 12:21:39 -0400
commit     48d8476b41eed63567dd2f0ad125c895b9ac648a
tree       02a6fd308233b40ba26b05234c7e05dcd847d961
parent     29dcea88779c856c7dc92040a0c01233263101d4
vfio/type1: Fix task tracking for QEMU vCPU hotplug
MAP_DMA ioctls might be called from various threads within a process;
for example, when using QEMU, the vCPU threads often generate these
calls, and we therefore take a reference to that vCPU task.
However, QEMU also supports vCPU hotplug on some machines and the task
that called MAP_DMA may have exited by the time UNMAP_DMA is called,
resulting in the mm_struct pointer being NULL and thus a failure to
match against the existing mapping.
To resolve this, we instead take a reference to the thread
group_leader, which has the same mm_struct and resource limits, but
is less likely to exit, at least in the QEMU case. A difficulty here is
guaranteeing that the capabilities of the group_leader match those of
the calling thread, which we resolve by tracking CAP_IPC_LOCK at the
time of calling rather than at an indeterminate time in the future.
Potentially this also results in better efficiency as this is now
recorded once per MAP_DMA ioctl.
Reported-by: Xu Yandong <xuyandong2@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
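
To make the reported scenario concrete, here is a minimal userspace sketch
(not part of the patch): a short-lived "vCPU" thread issues MAP_DMA and
exits, and a different thread later issues UNMAP_DMA. It uses the real
VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls from <linux/vfio.h>; the
setup_container() helper is hypothetical shorthand for the usual
container/group/VFIO_SET_IOMMU setup, and the IOVA choice is arbitrary.

#define _GNU_SOURCE
#include <linux/vfio.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define DMA_SIZE (2UL * 1024 * 1024)

/* Hypothetical helper: opens /dev/vfio/vfio, attaches the group and does
 * VFIO_SET_IOMMU(VFIO_TYPE1_IOMMU); elided to keep the sketch short. */
extern int setup_container(void);

struct ctx {
	int container;
	void *buf;
};

/* Stand-in for a vCPU thread: it performs the MAP_DMA and then exits. */
static void *mapper(void *opaque)
{
	struct ctx *c = opaque;
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uintptr_t)c->buf,
		.iova  = 0,
		.size  = DMA_SIZE,
	};

	if (ioctl(c->container, VFIO_IOMMU_MAP_DMA, &map))
		perror("VFIO_IOMMU_MAP_DMA");
	return NULL;		/* the mapping task is now gone */
}

int main(void)
{
	struct ctx c = { .container = setup_container() };
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova  = 0,
		.size  = DMA_SIZE,
	};
	pthread_t t;

	c.buf = mmap(NULL, DMA_SIZE, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (c.buf == MAP_FAILED)
		return 1;

	pthread_create(&t, NULL, mapper, &c);
	pthread_join(&t, NULL);	/* like a hot-unplugged vCPU, the mapper is dead */

	/* Unmap from a different, still-running thread: before the fix the
	 * accounting referenced the exited mapper task, whose mm is NULL. */
	if (ioctl(c.container, VFIO_IOMMU_UNMAP_DMA, &unmap))
		perror("VFIO_IOMMU_UNMAP_DMA");
	return 0;
}

With the patch applied, the vfio_dma records the group_leader and the
caller's CAP_IPC_LOCK state at MAP_DMA time, so the same sequence accounts
against an mm that outlives the vCPU thread.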
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c | 73
1 file changed, 47 insertions(+), 26 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3c082451ab1a..2c75b33db4ac 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -83,6 +83,7 @@ struct vfio_dma {
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
 	bool			iommu_mapped;
+	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
 };
@@ -253,29 +254,25 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 	return ret;
 }
 
-static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
+static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
 	struct mm_struct *mm;
-	bool is_current;
 	int ret;
 
 	if (!npage)
 		return 0;
 
-	is_current = (task->mm == current->mm);
-
-	mm = is_current ? task->mm : get_task_mm(task);
+	mm = async ? get_task_mm(dma->task) : dma->task->mm;
 	if (!mm)
 		return -ESRCH; /* process exited */
 
 	ret = down_write_killable(&mm->mmap_sem);
 	if (!ret) {
 		if (npage > 0) {
-			if (lock_cap ? !*lock_cap :
-			    !has_capability(task, CAP_IPC_LOCK)) {
+			if (!dma->lock_cap) {
 				unsigned long limit;
 
-				limit = task_rlimit(task,
+				limit = task_rlimit(dma->task,
 						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 				if (mm->locked_vm + npage > limit)
@@ -289,7 +286,7 @@ static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
 		up_write(&mm->mmap_sem);
 	}
 
-	if (!is_current)
+	if (async)
 		mmput(mm);
 
 	return ret;
@@ -400,7 +397,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  */
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 				  long npage, unsigned long *pfn_base,
-				  bool lock_cap, unsigned long limit)
+				  unsigned long limit)
 {
 	unsigned long pfn = 0;
 	long ret, pinned = 0, lock_acct = 0;
@@ -423,7 +420,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	 * pages are already counted against the user.
 	 */
 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-		if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
 			put_pfn(*pfn_base, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 				limit << PAGE_SHIFT);
@@ -449,7 +446,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 		}
 
 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-			if (!lock_cap &&
+			if (!dma->lock_cap &&
 			    current->mm->locked_vm + lock_acct + 1 > limit) {
 				put_pfn(pfn, dma->prot);
 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
@@ -462,7 +459,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	}
 
 out:
-	ret = vfio_lock_acct(current, lock_acct, &lock_cap);
+	ret = vfio_lock_acct(dma, lock_acct, false);
 
 unpin_out:
 	if (ret) {
@@ -493,7 +490,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 	}
 
 	if (do_accounting)
-		vfio_lock_acct(dma->task, locked - unlocked, NULL);
+		vfio_lock_acct(dma, locked - unlocked, true);
 
 	return unlocked;
 }
@@ -510,7 +507,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 
 	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
 	if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
-		ret = vfio_lock_acct(dma->task, 1, NULL);
+		ret = vfio_lock_acct(dma, 1, true);
 		if (ret) {
 			put_pfn(*pfn_base, dma->prot);
 			if (ret == -ENOMEM)
@@ -537,7 +534,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 
 	if (do_accounting)
-		vfio_lock_acct(dma->task, -unlocked, NULL);
+		vfio_lock_acct(dma, -unlocked, true);
 
 	return unlocked;
 }
@@ -829,7 +826,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
 
 	if (do_accounting) {
-		vfio_lock_acct(dma->task, -unlocked, NULL);
+		vfio_lock_acct(dma, -unlocked, true);
 		return 0;
 	}
 	return unlocked;
@@ -1044,14 +1041,12 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	size_t size = map_size;
 	long npage;
 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
 	int ret = 0;
 
 	while (size) {
 		/* Pin a contiguous chunk of memory */
 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
-					      size >> PAGE_SHIFT, &pfn,
-					      lock_cap, limit);
+					      size >> PAGE_SHIFT, &pfn, limit);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -1126,8 +1121,36 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	dma->iova = iova;
 	dma->vaddr = vaddr;
 	dma->prot = prot;
-	get_task_struct(current);
-	dma->task = current;
+
+	/*
+	 * We need to be able to both add to a task's locked memory and test
+	 * against the locked memory limit and we need to be able to do both
+	 * outside of this call path as pinning can be asynchronous via the
+	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
+	 * task_struct and VM locked pages requires an mm_struct, however
+	 * holding an indefinite mm reference is not recommended, therefore we
+	 * only hold a reference to a task.  We could hold a reference to
+	 * current, however QEMU uses this call path through vCPU threads,
+	 * which can be killed resulting in a NULL mm and failure in the unmap
+	 * path when called via a different thread.  Avoid this problem by
+	 * using the group_leader as threads within the same group require
+	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
+	 * mm_struct.
+	 *
+	 * Previously we also used the task for testing CAP_IPC_LOCK at the
+	 * time of pinning and accounting, however has_capability() makes use
+	 * of real_cred, a copy-on-write field, so we can't guarantee that it
+	 * matches group_leader, or in fact that it might not change by the
+	 * time it's evaluated.  If a process were to call MAP_DMA with
+	 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
+	 * possibly see different results for an iommu_mapped vfio_dma vs
+	 * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
+	 * time of calling MAP_DMA.
+	 */
+	get_task_struct(current->group_leader);
+	dma->task = current->group_leader;
+	dma->lock_cap = capable(CAP_IPC_LOCK);
+
 	dma->pfn_list = RB_ROOT;
 
 	/* Insert zero-sized and grow as we map chunks of it */
@@ -1162,7 +1185,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	struct vfio_domain *d;
 	struct rb_node *n;
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
 	int ret;
 
 	/* Arbitrarily pick the first domain in the list for lookups */
@@ -1209,8 +1231,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			npage = vfio_pin_pages_remote(dma, vaddr,
 						      n >> PAGE_SHIFT,
-						      &pfn, lock_cap,
-						      limit);
+						      &pfn, limit);
 			if (npage <= 0) {
 				WARN_ON(!npage);
 				ret = (int)npage;
@@ -1487,7 +1508,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
 			if (!is_invalid_reserved_pfn(vpfn->pfn))
 				locked++;
 		}
-		vfio_lock_acct(dma->task, locked - unlocked, NULL);
+		vfio_lock_acct(dma, locked - unlocked, true);
 	}
 }
 
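
A side note on the group_leader choice argued in the comment above: threads
created with CLONE_THREAD|CLONE_VM share the leader's thread-group id and its
mm, which is what makes accounting against current->group_leader equivalent to
accounting against the mapping thread. A tiny userspace check of the
thread-group relationship (purely illustrative, not part of the patch):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *worker(void *unused)
{
	(void)unused;
	/* Same thread-group id (pid) as the leader, different tid. */
	printf("worker: pid=%d tid=%ld\n", getpid(), syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* In the group leader, tid equals pid. */
	printf("leader: pid=%d tid=%ld\n", getpid(), syscall(SYS_gettid));

	pthread_create(&t, NULL, worker, NULL);
	pthread_join(&t, NULL);
	return 0;
}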