-rw-r--r--	fs/hugetlbfs/inode.c	| 61
-rw-r--r--	mm/hugetlb.c		| 21
2 files changed, 38 insertions, 44 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a10100e..a2fcea5f8225 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *	maps and global counts.  Page faults can not race with truncation
- *	in this routine.  hugetlb_no_page() prevents page faults in the
- *	truncated range.  It checks i_size before allocation, and again after
- *	with the page table lock for the page held.  The same lock must be
- *	acquired to unmap a page.
+ *	maps and global counts.
  * hole punch is indicated if end is not LLONG_MAX
  *	In the hole punch case we scan the range and release found pages.
  *	Only when releasing a page is the associated region/reserv map
  *	deleted.  The region/reserv map for ranges without associated
- *	pages are not modified.  Page faults can race with hole punch.
- *	This is indicated if we find a mapped page.
+ *	pages are not modified.
+ *
+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
+ * races with page faults.
+ *
  * Note: If the passed end of range value is beyond the end of file, but
  * not LLONG_MAX this routine still performs a hole punch operation.
  */
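
To make the caller convention in the new comment concrete, here is a minimal sketch of the expected pattern (hypothetical helper name, simplified arguments, no error handling; modeled on the truncate and hole punch hunks further down, not code from this patch):

	static void remove_range(struct inode *inode, loff_t start, loff_t end)
	{
		struct address_space *mapping = inode->i_mapping;

		i_mmap_lock_write(mapping);		/* blocks new page faults */
		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
			hugetlb_vmdelete_list(&mapping->i_mmap,
						start >> PAGE_SHIFT,
						end >> PAGE_SHIFT);
		/* rwsem still held: no fault can map a page before removal */
		remove_inode_hugepages(inode, start, end);
		i_mmap_unlock_write(mapping);
	}
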
@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
-			u32 hash;
 
 			index = page->index;
-			hash = hugetlb_fault_mutex_hash(h, current->mm,
-							&pseudo_vma,
-							mapping, index, 0);
-			mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
 			/*
-			 * If page is mapped, it was faulted in after being
-			 * unmapped in caller.  Unmap (again) now after taking
-			 * the fault mutex.  The mutex will prevent faults
-			 * until we finish removing the page.
-			 *
-			 * This race can only happen in the hole punch case.
-			 * Getting here in a truncate operation is a bug.
+			 * A mapped page is impossible as callers should unmap
+			 * all references before calling.  And, i_mmap_rwsem
+			 * prevents the creation of additional mappings.
 			 */
-			if (unlikely(page_mapped(page))) {
-				BUG_ON(truncate_op);
-
-				i_mmap_lock_write(mapping);
-				hugetlb_vmdelete_list(&mapping->i_mmap,
-					index * pages_per_huge_page(h),
-					(index + 1) * pages_per_huge_page(h));
-				i_mmap_unlock_write(mapping);
-			}
+			VM_BUG_ON(page_mapped(page));
 
 			lock_page(page);
 			/*
@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			}
 
 			unlock_page(page);
-			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
 		huge_pagevec_release(&pvec);
 		cond_resched();
@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
+	struct address_space *mapping = inode->i_mapping;
 	struct resv_map *resv_map;
 
+	/*
+	 * The vfs layer guarantees that there are no other users of this
+	 * inode.  Therefore, it would be safe to call remove_inode_hugepages
+	 * without holding i_mmap_rwsem.  We acquire and hold here to be
+	 * consistent with other callers.  Since there will be no contention
+	 * on the semaphore, overhead is negligible.
+	 */
+	i_mmap_lock_write(mapping);
 	remove_inode_hugepages(inode, 0, LLONG_MAX);
+	i_mmap_unlock_write(mapping);
+
 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
 	/* root inode doesn't have the resv_map, so we should check it */
 	if (resv_map)
@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	i_mmap_lock_write(mapping);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-	i_mmap_unlock_write(mapping);
 	remove_inode_hugepages(inode, offset, LLONG_MAX);
+	i_mmap_unlock_write(mapping);
 	return 0;
 }
 
@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			hugetlb_vmdelete_list(&mapping->i_mmap,
 						hole_start >> PAGE_SHIFT,
 						hole_end >> PAGE_SHIFT);
-		i_mmap_unlock_write(mapping);
 		remove_inode_hugepages(inode, hole_start, hole_end);
+		i_mmap_unlock_write(mapping);
 		inode_unlock(inode);
 	}
 
@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		/* addr is the offset within the file (zero based) */
 		addr = index * hpage_size;
 
-		/* mutex taken here, fault path and hole punch */
+		/*
+		 * fault mutex taken here, protects against fault path
+		 * and hole punch.  inode_lock previously taken protects
+		 * against truncation.
+		 */
 		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
 						index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
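
For context, the loop around this comment nests the per-index fault mutex inside the inode lock. A condensed sketch of that ordering, assuming the surrounding hugetlbfs_fallocate() structure and omitting allocation and error handling (not the full function body):

	inode_lock(inode);		/* held for the whole fallocate call */
	/* ... offset/size checks ... */
	for (index = start; index < end; index++) {
		addr = index * hpage_size;

		/* per page: excludes the fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* ... allocate the page and add it to the page cache ... */

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	}
	inode_unlock(inode);
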
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 87fd3ab809c6..e37efd5d8318 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * Use page lock to guard against racing truncation
-	 * before we get page_table_lock.
+	 * We can not race with truncation due to holding i_mmap_rwsem.
+	 * Check once here for faults beyond end of file.
 	 */
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto out;
+
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
-		size = i_size_read(mapping->host) >> huge_page_shift(h);
-		if (idx >= size)
-			goto out;
-
 		/*
 		 * Check for page in userfault range
 		 */
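
As a standalone illustration of why a single i_size check before retry: is now sufficient, here is a small user-space model using a pthread rwlock in place of i_mmap_rwsem (invented names, plain C, not kernel code): truncation shrinks the size and removes pages entirely under the write side, so the size a faulting thread observes under the read side cannot shrink while its fault is in progress.

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
	static long i_size_pages = 16;	/* models i_size >> huge_page_shift(h) */

	static void truncate_to(long new_size)
	{
		pthread_rwlock_wrlock(&i_mmap_rwsem);	/* like i_mmap_lock_write() */
		i_size_pages = new_size;
		/* ... unmap and remove all pages beyond new_size here ... */
		pthread_rwlock_unlock(&i_mmap_rwsem);
	}

	static int fault_in(long idx)
	{
		int ret = 0;

		pthread_rwlock_rdlock(&i_mmap_rwsem);	/* the fault path's hold */
		if (idx >= i_size_pages) {
			/* beyond EOF; size cannot shrink while we hold the lock */
			ret = -1;
		}
		/* else: find or allocate the page and map it, still under the lock */
		pthread_rwlock_unlock(&i_mmap_rwsem);
		return ret;
	}

	int main(void)
	{
		truncate_to(8);
		printf("fault at 4 -> %d, fault at 12 -> %d\n",
		       fault_in(4), fault_in(12));
		return 0;
	}
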
@@ -3854,9 +3854,6 @@ retry:
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
-	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
-		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3959,8 +3956,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/*
 	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep.  This prevents huge_pmd_unshare from
-	 * being called elsewhere and making the ptep no longer valid.
+	 * until finished with ptep.  This serves two purposes:
+	 * 1) It prevents huge_pmd_unshare from being called elsewhere
+	 *    and making the ptep no longer valid.
+	 * 2) It synchronizes us with file truncation.
 	 *
 	 * ptep could have already be assigned via huge_pte_offset.  That
 	 * is OK, as huge_pte_alloc will return the same value unless
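
The code below this comment is not part of the hunk, but the pattern it describes looks roughly like the sketch that follows (simplified, variable names approximate; whether the fault path takes the semaphore in read or write mode is not visible in this hunk, read mode is assumed here):

	/* hugetlb_fault(), simplified sketch, not the complete function */
	mapping = vma->vm_file->f_mapping;
	i_mmap_lock_read(mapping);		/* assumed read mode, see above */
	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
	if (!ptep) {
		i_mmap_unlock_read(mapping);
		return VM_FAULT_OOM;
	}
	/*
	 * ... handle the fault; huge_pmd_unshare cannot free ptep, and
	 * truncation cannot shrink i_size, while the rwsem is held ...
	 */
	i_mmap_unlock_read(mapping);
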