Diffstat (limited to 'mm/filemap.c')
 -rw-r--r--  mm/filemap.c | 422
 1 file changed, 253 insertions(+), 169 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..54e968650855 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
 
 #include <asm/mman.h>
 
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs);
 
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
 
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static int sync_page(void *word)
@@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:	page to add
  * @mapping:	the page's address_space
  * @offset:	page index
  * @gfp_mask:	page allocation mode
  *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
  * This function does not add the page to the LRU. The caller must do that.
  */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = mem_cgroup_cache_charge(page, current->mm,
+	int error;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & ~__GFP_HIGHMEM);
 	if (error)
 		goto out;
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
+		page_cache_get(page);
+		page->mapping = mapping;
+		page->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
+		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		} else
-			mem_cgroup_uncharge_page(page);
+		} else {
+			page->mapping = NULL;
+			mem_cgroup_uncharge_cache_page(page);
+			page_cache_release(page);
+		}
 
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_uncharge_cache_page(page);
 out:
 	return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
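
The rename makes the locking contract explicit: the caller locks the page before insertion, and the refcount, ->mapping and ->index are now set up before tree_lock is taken, with a full rollback if the radix-tree insert fails. Callers that want the old "lock it for me" behaviour keep an add_to_page_cache() wrapper in the companion include/linux/pagemap.h change; the sketch below is an approximation of that wrapper, and the __set_page_locked()/__clear_page_locked() helper names are assumptions from that companion patch rather than anything shown in this hunk.

/*
 * Sketch of the unlocked-entry wrapper that pairs with
 * add_to_page_cache_locked() above (assumed form of the companion
 * include/linux/pagemap.h change).
 */
static inline int add_to_page_cache(struct page *page,
		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	__set_page_locked(page);		/* new page: non-atomic set is enough */
	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(error))
		__clear_page_locked(page);	/* undo on failure */
	return error;
}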
@@ -557,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
  * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * test_and_set_bit() to lock the page; the second mb is necessary to enforce
+ * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
+ * races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
-	if (!TestClearPageLocked(page))
+	if (!test_and_clear_bit(PG_locked, &page->flags))
 		BUG();
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
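
unlock_page() now clears PG_locked with a plain test_and_clear_bit() because the TestSetPageLocked()/TestClearPageLocked() page-flag wrappers go away in this series; the acquire side is wrapped by trylock_page() (used further down in this patch) and lock_page(). A minimal sketch of that acquire side, assuming the companion include/linux/pagemap.h definitions rather than anything shown here:

/*
 * Sketch of the lock side that the barrier comment above pairs with
 * (assumed form of the companion include/linux/pagemap.h helpers).
 */
static inline int trylock_page(struct page *page)
{
	return !test_and_set_bit(PG_locked, &page->flags);
}

static inline void lock_page(struct page *page)
{
	might_sleep();
	if (!trylock_page(page))
		__lock_page(page);	/* sleep until PG_locked can be taken */
}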
@@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page)
  * Is there a pagecache struct page at the given (mapping, offset) tuple?
  * If yes, increment its refcount and return it; if no, return NULL.
  */
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+	void **pagep;
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page || page == RADIX_TREE_RETRY))
+			goto repeat;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
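
find_get_page() now follows the lockless pagecache protocol: look the slot up under rcu_read_lock(), take a speculative reference on the page it points to, then confirm that the slot still points at the same page; any failure restarts the lookup. The reference step, page_cache_get_speculative(), comes from the companion include/linux/pagemap.h patch; the sketch below is a simplification of it (the SMP=n fast path and the debug assertions are omitted, so treat the exact body as an assumption):

/*
 * Simplified sketch of page_cache_get_speculative() as relied on by
 * the RCU lookup above.
 */
static inline int page_cache_get_speculative(struct page *page)
{
	/*
	 * Only take a reference if the count is already non-zero; a page
	 * with a zero refcount may be in the middle of being freed and
	 * must not be resurrected here.
	 */
	if (unlikely(!get_page_unless_zero(page)))
		return 0;
	/*
	 * The caller must still re-check the radix-tree slot ("Has the
	 * page moved?" above): the page may have been freed and reused
	 * for something else between the lookup and this reference.
	 */
	return 1;
}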
@@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
  *
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
-struct page *find_lock_page(struct address_space *mapping,
-				pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 {
 	struct page *page;
 
 repeat:
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = find_get_page(mapping, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-			VM_BUG_ON(page->index != offset);
-			goto out;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
+		VM_BUG_ON(page->index != offset);
 	}
-	read_unlock_irq(&mapping->tree_lock);
-out:
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, start, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, index, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+		if (page->mapping == NULL || page->index != index)
 			break;
 
-		page_cache_get(pages[i]);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
-	return i;
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
 
@@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+				(void ***)pages, *index, nr_pages, tag);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -841,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 	struct page *page = find_get_page(mapping, index);
 
 	if (page) {
-		if (!TestSetPageLocked(page))
+		if (trylock_page(page))
 			return page;
 		page_cache_release(page);
 		return NULL;
@@ -933,8 +1023,17 @@ find_page:
 					ra, filp, page,
 					index, last_index - index);
 		}
-		if (!PageUptodate(page))
-			goto page_not_up_to_date;
+		if (!PageUptodate(page)) {
+			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+					!mapping->a_ops->is_partially_uptodate)
+				goto page_not_up_to_date;
+			if (!trylock_page(page))
+				goto page_not_up_to_date;
+			if (!mapping->a_ops->is_partially_uptodate(page,
+								desc, offset))
+				goto page_not_up_to_date_locked;
+			unlock_page(page);
+		}
 page_ok:
 		/*
 		 * i_size must be checked after we know the page is Uptodate.
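
With this check, a read that lies entirely within the up-to-date part of a not-fully-uptodate page (typically a blocksize < PAGE_CACHE_SIZE filesystem where only some buffers have been brought in) can be satisfied without another ->readpage() round trip. The hook itself is defined and implemented in companion patches; a buffer_head based filesystem would opt in roughly as below, where the block_is_partially_uptodate() helper and the ext2 wiring are assumptions based on those companion changes, not on this hunk:

/*
 * Assumed wiring of the new ->is_partially_uptodate() hook for a
 * buffer_head based filesystem (sketch only).
 */
const struct address_space_operations ext2_aops = {
	.readpage		= ext2_readpage,
	.readpages		= ext2_readpages,
	.writepage		= ext2_writepage,
	/* ... remaining methods unchanged ... */
	.is_partially_uptodate	= block_is_partially_uptodate,
};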
@@ -1004,6 +1103,7 @@ page_not_up_to_date:
 		if (lock_page_killable(page))
 			goto readpage_eio;
 
+page_not_up_to_date_locked:
 		/* Did it get truncated before we got the lock? */
 		if (!page->mapping) {
 			unlock_page(page);
@@ -1200,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 
 		mapping = filp->f_mapping;
 		inode = mapping->host;
-		retval = 0;
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
 		if (pos < size) {
-			retval = generic_file_direct_IO(READ, iocb,
-						iov, pos, nr_segs);
+			retval = filemap_write_and_wait(mapping);
+			if (!retval) {
+				retval = mapping->a_ops->direct_IO(READ, iocb,
+							iov, pos, nr_segs);
+			}
 			if (retval > 0)
 				*ppos = pos + retval;
-		}
-		if (likely(retval != 0)) {
-			file_accessed(filp);
-			goto out;
+			if (retval) {
+				file_accessed(filp);
+				goto out;
+			}
 		}
 	}
 
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
+	for (seg = 0; seg < nr_segs; seg++) {
+		read_descriptor_t desc;
 
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp,ppos,&desc,file_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-			if (desc.count > 0)
-				break;
-		}
+		desc.written = 0;
+		desc.arg.buf = iov[seg].iov_base;
+		desc.count = iov[seg].iov_len;
+		if (desc.count == 0)
+			continue;
+		desc.error = 0;
+		do_generic_file_read(filp, ppos, &desc, file_read_actor);
+		retval += desc.written;
+		if (desc.error) {
+			retval = retval ?: desc.error;
+			break;
+		}
+		if (desc.count > 0)
+			break;
 	}
 out:
 	return retval;
@@ -1669,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs);
 }
 
-int remove_suid(struct dentry *dentry)
+int file_remove_suid(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	int killsuid = should_remove_suid(dentry);
 	int killpriv = security_inode_need_killpriv(dentry);
 	int error = 0;
@@ -1684,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
 
 	return error;
 }
-EXPORT_SYMBOL(remove_suid);
+EXPORT_SYMBOL(file_remove_suid);
 
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
@@ -1779,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 		 * The !iov->iov_len check ensures we skip over unlikely
 		 * zero-length segments (without overruning the iovec).
 		 */
-		while (bytes || unlikely(!iov->iov_len && i->count)) {
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
 			int copy;
 
 			copy = min(bytes, iov->iov_len - base);
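
Swapping the operands makes the loop test i->count before it reads iov->iov_len, so an exhausted iterator (count == 0, with iov possibly advanced one past the last segment) never dereferences that past-the-end entry. The short-circuit behaviour this relies on can be shown with a small stand-alone program (plain user-space C, purely illustrative):

/*
 * Illustration of why the operand order matters: with count checked
 * first, the past-the-end iovec pointer is never dereferenced.
 */
#include <stddef.h>
#include <stdio.h>

struct iovec_like { size_t iov_len; };

int main(void)
{
	struct iovec_like vec[2] = { { 4 }, { 8 } };
	struct iovec_like *iov = &vec[2];	/* one past the end, like a fully advanced iterator */
	size_t count = 0;			/* nothing left in the iterator */
	size_t bytes = 0;

	while (bytes || (count && !iov->iov_len)) {
		/* never reached: count == 0 short-circuits before *iov is read */
		iov++;
	}
	printf("exited without reading iov->iov_len\n");
	return 0;
}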
@@ -2004,11 +2104,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	ssize_t written;
+	size_t write_len;
+	pgoff_t end;
 
 	if (count != ocount)
 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
 
-	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+	/*
+	 * Unmap all mmappings of the file up-front.
+	 *
+	 * This will cause any pte dirty bits to be propagated into the
+	 * pageframes for the subsequent filemap_write_and_wait().
+	 */
+	write_len = iov_length(iov, *nr_segs);
+	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+	if (mapping_mapped(mapping))
+		unmap_mapping_range(mapping, pos, write_len, 0);
+
+	written = filemap_write_and_wait(mapping);
+	if (written)
+		goto out;
+
+	/*
+	 * After a write we want buffered reads to be sure to go to disk to get
+	 * the new data. We invalidate clean cached page from the region we're
+	 * about to write. We do this *before* the write so that we can return
+	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 */
+	if (mapping->nrpages) {
+		written = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (written)
+			goto out;
+	}
+
+	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+
+	/*
+	 * Finally, try again to invalidate clean pages which might have been
+	 * cached by non-direct readahead, or faulted in by get_user_pages()
+	 * if the source of the write was an mmap'ed region of the file
+	 * we're writing. Either one is a pretty crazy thing to do,
+	 * so we don't support it 100%. If this invalidation
+	 * fails, tough, the write still worked...
+	 */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+	}
+
 	if (written > 0) {
 		loff_t end = pos + written;
 		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2168,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * i_mutex is held, which protects generic_osync_inode() from
 	 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
 	 */
+out:
 	if ((written >= 0 || written == -EIOCBQUEUED) &&
 	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2395,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_path.dentry);
+	err = file_remove_suid(file);
 	if (err)
 		goto out;
 
@@ -2511,66 +2656,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_file_aio_write);
 
-/*
- * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
- * went wrong during pagecache shootdown.
- */
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	ssize_t retval;
-	size_t write_len;
-	pgoff_t end = 0; /* silence gcc */
-
-	/*
-	 * If it's a write, unmap all mmappings of the file up-front. This
-	 * will cause any pte dirty bits to be propagated into the pageframes
-	 * for the subsequent filemap_write_and_wait().
-	 */
-	if (rw == WRITE) {
-		write_len = iov_length(iov, nr_segs);
-		end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
-		if (mapping_mapped(mapping))
-			unmap_mapping_range(mapping, offset, write_len, 0);
-	}
-
-	retval = filemap_write_and_wait(mapping);
-	if (retval)
-		goto out;
-
-	/*
-	 * After a write we want buffered reads to be sure to go to disk to get
-	 * the new data. We invalidate clean cached page from the region we're
-	 * about to write. We do this *before* the write so that we can return
-	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
-	 */
-	if (rw == WRITE && mapping->nrpages) {
-		retval = invalidate_inode_pages2_range(mapping,
-					offset >> PAGE_CACHE_SHIFT, end);
-		if (retval)
-			goto out;
-	}
-
-	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
-
-	/*
-	 * Finally, try again to invalidate clean pages which might have been
-	 * cached by non-direct readahead, or faulted in by get_user_pages()
-	 * if the source of the write was an mmap'ed region of the file
-	 * we're writing. Either one is a pretty crazy thing to do,
-	 * so we don't support it 100%. If this invalidation
-	 * fails, tough, the write still worked...
-	 */
-	if (rw == WRITE && mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
-	}
-out:
-	return retval;
-}
-
 /**
  * try_to_release_page() - release old fs-specific metadata on a page
  *
@@ -2582,9 +2667,8 @@ out:
  * Otherwise return zero.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
  *
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
 {
