author     Ingo Molnar <mingo@elte.hu>   2008-08-14 06:19:59 -0400
committer  Ingo Molnar <mingo@elte.hu>   2008-08-14 06:19:59 -0400
commit     8d7ccaa545490cdffdfaff0842436a8dd85cf47b (patch)
tree       8129b5907161bc6ae26deb3645ce1e280c5e1f51 /mm/filemap.c
parent     b2139aa0eec330c711c5a279db361e5ef1178e78 (diff)
parent     30a2f3c60a84092c8084dfe788b710f8d0768cd4 (diff)
Merge commit 'v2.6.27-rc3' into x86/prototypes
Conflicts:
include/asm-x86/dma-mapping.h
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--  mm/filemap.c  422
1 file changed, 253 insertions, 169 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..54e968650855 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@ | |||
42 | 42 | ||
43 | #include <asm/mman.h> | 43 | #include <asm/mman.h> |
44 | 44 | ||
45 | static ssize_t | ||
46 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
47 | loff_t offset, unsigned long nr_segs); | ||
48 | 45 | ||
49 | /* | 46 | /* |
50 | * Shared mappings implemented 30.11.1994. It's not fully working yet, | 47 | * Shared mappings implemented 30.11.1994. It's not fully working yet, |
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
112 | /* | 109 | /* |
113 | * Remove a page from the page cache and free it. Caller has to make | 110 | * Remove a page from the page cache and free it. Caller has to make |
114 | * sure the page is locked and that nobody else uses it - or that usage | 111 | * sure the page is locked and that nobody else uses it - or that usage |
115 | * is safe. The caller must hold a write_lock on the mapping's tree_lock. | 112 | * is safe. The caller must hold the mapping's tree_lock. |
116 | */ | 113 | */ |
117 | void __remove_from_page_cache(struct page *page) | 114 | void __remove_from_page_cache(struct page *page) |
118 | { | 115 | { |
119 | struct address_space *mapping = page->mapping; | 116 | struct address_space *mapping = page->mapping; |
120 | 117 | ||
121 | mem_cgroup_uncharge_page(page); | 118 | mem_cgroup_uncharge_cache_page(page); |
122 | radix_tree_delete(&mapping->page_tree, page->index); | 119 | radix_tree_delete(&mapping->page_tree, page->index); |
123 | page->mapping = NULL; | 120 | page->mapping = NULL; |
124 | mapping->nrpages--; | 121 | mapping->nrpages--; |
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page) | |||
144 | 141 | ||
145 | BUG_ON(!PageLocked(page)); | 142 | BUG_ON(!PageLocked(page)); |
146 | 143 | ||
147 | write_lock_irq(&mapping->tree_lock); | 144 | spin_lock_irq(&mapping->tree_lock); |
148 | __remove_from_page_cache(page); | 145 | __remove_from_page_cache(page); |
149 | write_unlock_irq(&mapping->tree_lock); | 146 | spin_unlock_irq(&mapping->tree_lock); |
150 | } | 147 | } |
151 | 148 | ||
152 | static int sync_page(void *word) | 149 | static int sync_page(void *word) |
@@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
445 | } | 442 | } |
446 | 443 | ||
447 | /** | 444 | /** |
448 | * add_to_page_cache - add newly allocated pagecache pages | 445 | * add_to_page_cache_locked - add a locked page to the pagecache |
449 | * @page: page to add | 446 | * @page: page to add |
450 | * @mapping: the page's address_space | 447 | * @mapping: the page's address_space |
451 | * @offset: page index | 448 | * @offset: page index |
452 | * @gfp_mask: page allocation mode | 449 | * @gfp_mask: page allocation mode |
453 | * | 450 | * |
454 | * This function is used to add newly allocated pagecache pages; | 451 | * This function is used to add a page to the pagecache. It must be locked. |
455 | * the page is new, so we can just run SetPageLocked() against it. | ||
456 | * The other page state flags were set by rmqueue(). | ||
457 | * | ||
458 | * This function does not add the page to the LRU. The caller must do that. | 452 | * This function does not add the page to the LRU. The caller must do that. |
459 | */ | 453 | */ |
460 | int add_to_page_cache(struct page *page, struct address_space *mapping, | 454 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
461 | pgoff_t offset, gfp_t gfp_mask) | 455 | pgoff_t offset, gfp_t gfp_mask) |
462 | { | 456 | { |
463 | int error = mem_cgroup_cache_charge(page, current->mm, | 457 | int error; |
458 | |||
459 | VM_BUG_ON(!PageLocked(page)); | ||
460 | |||
461 | error = mem_cgroup_cache_charge(page, current->mm, | ||
464 | gfp_mask & ~__GFP_HIGHMEM); | 462 | gfp_mask & ~__GFP_HIGHMEM); |
465 | if (error) | 463 | if (error) |
466 | goto out; | 464 | goto out; |
467 | 465 | ||
468 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 466 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
469 | if (error == 0) { | 467 | if (error == 0) { |
470 | write_lock_irq(&mapping->tree_lock); | 468 | page_cache_get(page); |
469 | page->mapping = mapping; | ||
470 | page->index = offset; | ||
471 | |||
472 | spin_lock_irq(&mapping->tree_lock); | ||
471 | error = radix_tree_insert(&mapping->page_tree, offset, page); | 473 | error = radix_tree_insert(&mapping->page_tree, offset, page); |
472 | if (!error) { | 474 | if (likely(!error)) { |
473 | page_cache_get(page); | ||
474 | SetPageLocked(page); | ||
475 | page->mapping = mapping; | ||
476 | page->index = offset; | ||
477 | mapping->nrpages++; | 475 | mapping->nrpages++; |
478 | __inc_zone_page_state(page, NR_FILE_PAGES); | 476 | __inc_zone_page_state(page, NR_FILE_PAGES); |
479 | } else | 477 | } else { |
480 | mem_cgroup_uncharge_page(page); | 478 | page->mapping = NULL; |
479 | mem_cgroup_uncharge_cache_page(page); | ||
480 | page_cache_release(page); | ||
481 | } | ||
481 | 482 | ||
482 | write_unlock_irq(&mapping->tree_lock); | 483 | spin_unlock_irq(&mapping->tree_lock); |
483 | radix_tree_preload_end(); | 484 | radix_tree_preload_end(); |
484 | } else | 485 | } else |
485 | mem_cgroup_uncharge_page(page); | 486 | mem_cgroup_uncharge_cache_page(page); |
486 | out: | 487 | out: |
487 | return error; | 488 | return error; |
488 | } | 489 | } |
489 | EXPORT_SYMBOL(add_to_page_cache); | 490 | EXPORT_SYMBOL(add_to_page_cache_locked); |
490 | 491 | ||
491 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 492 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
492 | pgoff_t offset, gfp_t gfp_mask) | 493 | pgoff_t offset, gfp_t gfp_mask) |
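The renamed add_to_page_cache_locked() above changes the calling convention: the page must already be locked (hence the VM_BUG_ON), and the reference, page->mapping and page->index are all set up before the radix tree insert, so a lockless lookup that finds the page the instant it is published sees consistent fields; on failure the same state is unwound under the lock. The old add_to_page_cache() name survives as a small wrapper in include/linux/pagemap.h, which is not part of this diff; recalled from the same patch series, it looks roughly like the sketch below, so treat the __set_page_locked()/__clear_page_locked() helper names as approximate.

static inline int add_to_page_cache(struct page *page,
		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	/* The page is brand new, so it can be locked non-atomically. */
	__set_page_locked(page);
	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(error))
		__clear_page_locked(page);
	return error;
}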
@@ -557,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit); | |||
557 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | 558 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
558 | * | 559 | * |
559 | * The first mb is necessary to safely close the critical section opened by the | 560 | * The first mb is necessary to safely close the critical section opened by the |
560 | * TestSetPageLocked(), the second mb is necessary to enforce ordering between | 561 | * test_and_set_bit() to lock the page; the second mb is necessary to enforce |
561 | * the clear_bit and the read of the waitqueue (to avoid SMP races with a | 562 | * ordering between the clear_bit and the read of the waitqueue (to avoid SMP |
562 | * parallel wait_on_page_locked()). | 563 | * races with a parallel wait_on_page_locked()). |
563 | */ | 564 | */ |
564 | void unlock_page(struct page *page) | 565 | void unlock_page(struct page *page) |
565 | { | 566 | { |
566 | smp_mb__before_clear_bit(); | 567 | smp_mb__before_clear_bit(); |
567 | if (!TestClearPageLocked(page)) | 568 | if (!test_and_clear_bit(PG_locked, &page->flags)) |
568 | BUG(); | 569 | BUG(); |
569 | smp_mb__after_clear_bit(); | 570 | smp_mb__after_clear_bit(); |
570 | wake_up_page(page, PG_locked); | 571 | wake_up_page(page, PG_locked); |
@@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page) | |||
636 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 637 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
637 | * If yes, increment its refcount and return it; if no, return NULL. | 638 | * If yes, increment its refcount and return it; if no, return NULL. |
638 | */ | 639 | */ |
639 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) | 640 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) |
640 | { | 641 | { |
642 | void **pagep; | ||
641 | struct page *page; | 643 | struct page *page; |
642 | 644 | ||
643 | read_lock_irq(&mapping->tree_lock); | 645 | rcu_read_lock(); |
644 | page = radix_tree_lookup(&mapping->page_tree, offset); | 646 | repeat: |
645 | if (page) | 647 | page = NULL; |
646 | page_cache_get(page); | 648 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
647 | read_unlock_irq(&mapping->tree_lock); | 649 | if (pagep) { |
650 | page = radix_tree_deref_slot(pagep); | ||
651 | if (unlikely(!page || page == RADIX_TREE_RETRY)) | ||
652 | goto repeat; | ||
653 | |||
654 | if (!page_cache_get_speculative(page)) | ||
655 | goto repeat; | ||
656 | |||
657 | /* | ||
658 | * Has the page moved? | ||
659 | * This is part of the lockless pagecache protocol. See | ||
660 | * include/linux/pagemap.h for details. | ||
661 | */ | ||
662 | if (unlikely(page != *pagep)) { | ||
663 | page_cache_release(page); | ||
664 | goto repeat; | ||
665 | } | ||
666 | } | ||
667 | rcu_read_unlock(); | ||
668 | |||
648 | return page; | 669 | return page; |
649 | } | 670 | } |
650 | EXPORT_SYMBOL(find_get_page); | 671 | EXPORT_SYMBOL(find_get_page); |
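The rewritten find_get_page() above is the core of the lockless pagecache: instead of taking mapping->tree_lock for every lookup, it walks the radix tree under rcu_read_lock(), takes a reference only if the page's count is still non-zero (page_cache_get_speculative()), and then re-checks that the slot still points at the same page, retrying if it raced with a removal or replacement. The following is a minimal user-space sketch of that get-then-recheck pattern using C11 atomics; every name in it (struct obj, slot, try_get_speculative, lookup) is illustrative, and it deliberately leaves out the RCU grace period that the real code relies on to keep the struct page's memory valid while the speculative get is in flight.

/* Illustrative user-space analogue of the lockless lookup, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;		/* drops to 0 only when being freed */
	int payload;
};

static _Atomic(struct obj *) slot;	/* stands in for one radix-tree slot */

/* Take a reference unless the refcount has already reached zero. */
static int try_get_speculative(struct obj *o)
{
	int c = atomic_load(&o->refcount);

	while (c > 0) {
		if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
			return 1;	/* got a reference */
	}
	return 0;			/* object is on its way out */
}

static struct obj *lookup(void)
{
	struct obj *o;

repeat:
	o = atomic_load(&slot);
	if (!o)
		return NULL;
	if (!try_get_speculative(o))
		goto repeat;		/* raced with the final put */
	/* Has the slot moved on while we were taking the reference? */
	if (o != atomic_load(&slot)) {
		atomic_fetch_sub(&o->refcount, 1);	/* drop it and retry */
		goto repeat;
	}
	return o;			/* caller now owns one reference */
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->refcount, 1);
	o->payload = 42;
	atomic_store(&slot, o);

	struct obj *found = lookup();
	if (found)
		printf("payload %d, refcount now %d\n",
		       found->payload, atomic_load(&found->refcount));
	free(o);
	return 0;
}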
@@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); | |||
659 | * | 680 | * |
660 | * Returns zero if the page was not present. find_lock_page() may sleep. | 681 | * Returns zero if the page was not present. find_lock_page() may sleep. |
661 | */ | 682 | */ |
662 | struct page *find_lock_page(struct address_space *mapping, | 683 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) |
663 | pgoff_t offset) | ||
664 | { | 684 | { |
665 | struct page *page; | 685 | struct page *page; |
666 | 686 | ||
667 | repeat: | 687 | repeat: |
668 | read_lock_irq(&mapping->tree_lock); | 688 | page = find_get_page(mapping, offset); |
669 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
670 | if (page) { | 689 | if (page) { |
671 | page_cache_get(page); | 690 | lock_page(page); |
672 | if (TestSetPageLocked(page)) { | 691 | /* Has the page been truncated? */ |
673 | read_unlock_irq(&mapping->tree_lock); | 692 | if (unlikely(page->mapping != mapping)) { |
674 | __lock_page(page); | 693 | unlock_page(page); |
675 | 694 | page_cache_release(page); | |
676 | /* Has the page been truncated while we slept? */ | 695 | goto repeat; |
677 | if (unlikely(page->mapping != mapping)) { | ||
678 | unlock_page(page); | ||
679 | page_cache_release(page); | ||
680 | goto repeat; | ||
681 | } | ||
682 | VM_BUG_ON(page->index != offset); | ||
683 | goto out; | ||
684 | } | 696 | } |
697 | VM_BUG_ON(page->index != offset); | ||
685 | } | 698 | } |
686 | read_unlock_irq(&mapping->tree_lock); | ||
687 | out: | ||
688 | return page; | 699 | return page; |
689 | } | 700 | } |
690 | EXPORT_SYMBOL(find_lock_page); | 701 | EXPORT_SYMBOL(find_lock_page); |
@@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
750 | { | 761 | { |
751 | unsigned int i; | 762 | unsigned int i; |
752 | unsigned int ret; | 763 | unsigned int ret; |
764 | unsigned int nr_found; | ||
765 | |||
766 | rcu_read_lock(); | ||
767 | restart: | ||
768 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
769 | (void ***)pages, start, nr_pages); | ||
770 | ret = 0; | ||
771 | for (i = 0; i < nr_found; i++) { | ||
772 | struct page *page; | ||
773 | repeat: | ||
774 | page = radix_tree_deref_slot((void **)pages[i]); | ||
775 | if (unlikely(!page)) | ||
776 | continue; | ||
777 | /* | ||
778 | * this can only trigger if nr_found == 1, making livelock | ||
779 | * a non issue. | ||
780 | */ | ||
781 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
782 | goto restart; | ||
753 | 783 | ||
754 | read_lock_irq(&mapping->tree_lock); | 784 | if (!page_cache_get_speculative(page)) |
755 | ret = radix_tree_gang_lookup(&mapping->page_tree, | 785 | goto repeat; |
756 | (void **)pages, start, nr_pages); | 786 | |
757 | for (i = 0; i < ret; i++) | 787 | /* Has the page moved? */ |
758 | page_cache_get(pages[i]); | 788 | if (unlikely(page != *((void **)pages[i]))) { |
759 | read_unlock_irq(&mapping->tree_lock); | 789 | page_cache_release(page); |
790 | goto repeat; | ||
791 | } | ||
792 | |||
793 | pages[ret] = page; | ||
794 | ret++; | ||
795 | } | ||
796 | rcu_read_unlock(); | ||
760 | return ret; | 797 | return ret; |
761 | } | 798 | } |
762 | 799 | ||
@@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
777 | { | 814 | { |
778 | unsigned int i; | 815 | unsigned int i; |
779 | unsigned int ret; | 816 | unsigned int ret; |
817 | unsigned int nr_found; | ||
818 | |||
819 | rcu_read_lock(); | ||
820 | restart: | ||
821 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
822 | (void ***)pages, index, nr_pages); | ||
823 | ret = 0; | ||
824 | for (i = 0; i < nr_found; i++) { | ||
825 | struct page *page; | ||
826 | repeat: | ||
827 | page = radix_tree_deref_slot((void **)pages[i]); | ||
828 | if (unlikely(!page)) | ||
829 | continue; | ||
830 | /* | ||
831 | * this can only trigger if nr_found == 1, making livelock | ||
832 | * a non issue. | ||
833 | */ | ||
834 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
835 | goto restart; | ||
780 | 836 | ||
781 | read_lock_irq(&mapping->tree_lock); | 837 | if (page->mapping == NULL || page->index != index) |
782 | ret = radix_tree_gang_lookup(&mapping->page_tree, | ||
783 | (void **)pages, index, nr_pages); | ||
784 | for (i = 0; i < ret; i++) { | ||
785 | if (pages[i]->mapping == NULL || pages[i]->index != index) | ||
786 | break; | 838 | break; |
787 | 839 | ||
788 | page_cache_get(pages[i]); | 840 | if (!page_cache_get_speculative(page)) |
841 | goto repeat; | ||
842 | |||
843 | /* Has the page moved? */ | ||
844 | if (unlikely(page != *((void **)pages[i]))) { | ||
845 | page_cache_release(page); | ||
846 | goto repeat; | ||
847 | } | ||
848 | |||
849 | pages[ret] = page; | ||
850 | ret++; | ||
789 | index++; | 851 | index++; |
790 | } | 852 | } |
791 | read_unlock_irq(&mapping->tree_lock); | 853 | rcu_read_unlock(); |
792 | return i; | 854 | return ret; |
793 | } | 855 | } |
794 | EXPORT_SYMBOL(find_get_pages_contig); | 856 | EXPORT_SYMBOL(find_get_pages_contig); |
795 | 857 | ||
@@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
809 | { | 871 | { |
810 | unsigned int i; | 872 | unsigned int i; |
811 | unsigned int ret; | 873 | unsigned int ret; |
874 | unsigned int nr_found; | ||
875 | |||
876 | rcu_read_lock(); | ||
877 | restart: | ||
878 | nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, | ||
879 | (void ***)pages, *index, nr_pages, tag); | ||
880 | ret = 0; | ||
881 | for (i = 0; i < nr_found; i++) { | ||
882 | struct page *page; | ||
883 | repeat: | ||
884 | page = radix_tree_deref_slot((void **)pages[i]); | ||
885 | if (unlikely(!page)) | ||
886 | continue; | ||
887 | /* | ||
888 | * this can only trigger if nr_found == 1, making livelock | ||
889 | * a non issue. | ||
890 | */ | ||
891 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
892 | goto restart; | ||
893 | |||
894 | if (!page_cache_get_speculative(page)) | ||
895 | goto repeat; | ||
896 | |||
897 | /* Has the page moved? */ | ||
898 | if (unlikely(page != *((void **)pages[i]))) { | ||
899 | page_cache_release(page); | ||
900 | goto repeat; | ||
901 | } | ||
902 | |||
903 | pages[ret] = page; | ||
904 | ret++; | ||
905 | } | ||
906 | rcu_read_unlock(); | ||
812 | 907 | ||
813 | read_lock_irq(&mapping->tree_lock); | ||
814 | ret = radix_tree_gang_lookup_tag(&mapping->page_tree, | ||
815 | (void **)pages, *index, nr_pages, tag); | ||
816 | for (i = 0; i < ret; i++) | ||
817 | page_cache_get(pages[i]); | ||
818 | if (ret) | 908 | if (ret) |
819 | *index = pages[ret - 1]->index + 1; | 909 | *index = pages[ret - 1]->index + 1; |
820 | read_unlock_irq(&mapping->tree_lock); | 910 | |
821 | return ret; | 911 | return ret; |
822 | } | 912 | } |
823 | EXPORT_SYMBOL(find_get_pages_tag); | 913 | EXPORT_SYMBOL(find_get_pages_tag); |
@@ -841,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
841 | struct page *page = find_get_page(mapping, index); | 931 | struct page *page = find_get_page(mapping, index); |
842 | 932 | ||
843 | if (page) { | 933 | if (page) { |
844 | if (!TestSetPageLocked(page)) | 934 | if (trylock_page(page)) |
845 | return page; | 935 | return page; |
846 | page_cache_release(page); | 936 | page_cache_release(page); |
847 | return NULL; | 937 | return NULL; |
@@ -933,8 +1023,17 @@ find_page: | |||
933 | ra, filp, page, | 1023 | ra, filp, page, |
934 | index, last_index - index); | 1024 | index, last_index - index); |
935 | } | 1025 | } |
936 | if (!PageUptodate(page)) | 1026 | if (!PageUptodate(page)) { |
937 | goto page_not_up_to_date; | 1027 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || |
1028 | !mapping->a_ops->is_partially_uptodate) | ||
1029 | goto page_not_up_to_date; | ||
1030 | if (!trylock_page(page)) | ||
1031 | goto page_not_up_to_date; | ||
1032 | if (!mapping->a_ops->is_partially_uptodate(page, | ||
1033 | desc, offset)) | ||
1034 | goto page_not_up_to_date_locked; | ||
1035 | unlock_page(page); | ||
1036 | } | ||
938 | page_ok: | 1037 | page_ok: |
939 | /* | 1038 | /* |
940 | * i_size must be checked after we know the page is Uptodate. | 1039 | * i_size must be checked after we know the page is Uptodate. |
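The new branch above gives the read path a way to finish without ->readpage when the page as a whole is not uptodate but every filesystem block covering the requested range already is: it only applies when the block size is smaller than the page size, it uses trylock_page() so this fast path never sleeps on the page lock, and it defers the decision to the filesystem's new ->is_partially_uptodate address_space operation. As a toy, self-contained illustration of that kind of range check (hypothetical helper, block state modelled as a plain flag array rather than buffer heads):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Toy model of a page split into equal-sized blocks, each with its own
 * uptodate flag. Returns true when every block overlapping the byte range
 * [from, from + count) is already uptodate.
 */
static bool range_is_partially_uptodate(const bool *block_uptodate,
					size_t nr_blocks, size_t blocksize,
					size_t from, size_t count)
{
	size_t first, last;

	if (count == 0)
		return false;
	first = from / blocksize;
	last = (from + count - 1) / blocksize;
	if (last >= nr_blocks)
		return false;

	for (size_t i = first; i <= last; i++)
		if (!block_uptodate[i])
			return false;
	return true;
}

int main(void)
{
	/* A 4096-byte page made of four 1024-byte blocks; block 3 not read yet. */
	bool uptodate[4] = { true, true, true, false };

	printf("%d\n", range_is_partially_uptodate(uptodate, 4, 1024, 100, 800));  /* 1 */
	printf("%d\n", range_is_partially_uptodate(uptodate, 4, 1024, 3000, 200)); /* 0 */
	return 0;
}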
@@ -1004,6 +1103,7 @@ page_not_up_to_date: | |||
1004 | if (lock_page_killable(page)) | 1103 | if (lock_page_killable(page)) |
1005 | goto readpage_eio; | 1104 | goto readpage_eio; |
1006 | 1105 | ||
1106 | page_not_up_to_date_locked: | ||
1007 | /* Did it get truncated before we got the lock? */ | 1107 | /* Did it get truncated before we got the lock? */ |
1008 | if (!page->mapping) { | 1108 | if (!page->mapping) { |
1009 | unlock_page(page); | 1109 | unlock_page(page); |
@@ -1200,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1200 | 1300 | ||
1201 | mapping = filp->f_mapping; | 1301 | mapping = filp->f_mapping; |
1202 | inode = mapping->host; | 1302 | inode = mapping->host; |
1203 | retval = 0; | ||
1204 | if (!count) | 1303 | if (!count) |
1205 | goto out; /* skip atime */ | 1304 | goto out; /* skip atime */ |
1206 | size = i_size_read(inode); | 1305 | size = i_size_read(inode); |
1207 | if (pos < size) { | 1306 | if (pos < size) { |
1208 | retval = generic_file_direct_IO(READ, iocb, | 1307 | retval = filemap_write_and_wait(mapping); |
1209 | iov, pos, nr_segs); | 1308 | if (!retval) { |
1309 | retval = mapping->a_ops->direct_IO(READ, iocb, | ||
1310 | iov, pos, nr_segs); | ||
1311 | } | ||
1210 | if (retval > 0) | 1312 | if (retval > 0) |
1211 | *ppos = pos + retval; | 1313 | *ppos = pos + retval; |
1212 | } | 1314 | if (retval) { |
1213 | if (likely(retval != 0)) { | 1315 | file_accessed(filp); |
1214 | file_accessed(filp); | 1316 | goto out; |
1215 | goto out; | 1317 | } |
1216 | } | 1318 | } |
1217 | } | 1319 | } |
1218 | 1320 | ||
1219 | retval = 0; | 1321 | for (seg = 0; seg < nr_segs; seg++) { |
1220 | if (count) { | 1322 | read_descriptor_t desc; |
1221 | for (seg = 0; seg < nr_segs; seg++) { | ||
1222 | read_descriptor_t desc; | ||
1223 | 1323 | ||
1224 | desc.written = 0; | 1324 | desc.written = 0; |
1225 | desc.arg.buf = iov[seg].iov_base; | 1325 | desc.arg.buf = iov[seg].iov_base; |
1226 | desc.count = iov[seg].iov_len; | 1326 | desc.count = iov[seg].iov_len; |
1227 | if (desc.count == 0) | 1327 | if (desc.count == 0) |
1228 | continue; | 1328 | continue; |
1229 | desc.error = 0; | 1329 | desc.error = 0; |
1230 | do_generic_file_read(filp,ppos,&desc,file_read_actor); | 1330 | do_generic_file_read(filp, ppos, &desc, file_read_actor); |
1231 | retval += desc.written; | 1331 | retval += desc.written; |
1232 | if (desc.error) { | 1332 | if (desc.error) { |
1233 | retval = retval ?: desc.error; | 1333 | retval = retval ?: desc.error; |
1234 | break; | 1334 | break; |
1235 | } | ||
1236 | if (desc.count > 0) | ||
1237 | break; | ||
1238 | } | 1335 | } |
1336 | if (desc.count > 0) | ||
1337 | break; | ||
1239 | } | 1338 | } |
1240 | out: | 1339 | out: |
1241 | return retval; | 1340 | return retval; |
@@ -1669,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill) | |||
1669 | return notify_change(dentry, &newattrs); | 1768 | return notify_change(dentry, &newattrs); |
1670 | } | 1769 | } |
1671 | 1770 | ||
1672 | int remove_suid(struct dentry *dentry) | 1771 | int file_remove_suid(struct file *file) |
1673 | { | 1772 | { |
1773 | struct dentry *dentry = file->f_path.dentry; | ||
1674 | int killsuid = should_remove_suid(dentry); | 1774 | int killsuid = should_remove_suid(dentry); |
1675 | int killpriv = security_inode_need_killpriv(dentry); | 1775 | int killpriv = security_inode_need_killpriv(dentry); |
1676 | int error = 0; | 1776 | int error = 0; |
@@ -1684,7 +1784,7 @@ int remove_suid(struct dentry *dentry) | |||
1684 | 1784 | ||
1685 | return error; | 1785 | return error; |
1686 | } | 1786 | } |
1687 | EXPORT_SYMBOL(remove_suid); | 1787 | EXPORT_SYMBOL(file_remove_suid); |
1688 | 1788 | ||
1689 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 1789 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
1690 | const struct iovec *iov, size_t base, size_t bytes) | 1790 | const struct iovec *iov, size_t base, size_t bytes) |
@@ -1779,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
1779 | * The !iov->iov_len check ensures we skip over unlikely | 1879 | * The !iov->iov_len check ensures we skip over unlikely |
1780 | * zero-length segments (without overruning the iovec). | 1880 | * zero-length segments (without overruning the iovec). |
1781 | */ | 1881 | */ |
1782 | while (bytes || unlikely(!iov->iov_len && i->count)) { | 1882 | while (bytes || unlikely(i->count && !iov->iov_len)) { |
1783 | int copy; | 1883 | int copy; |
1784 | 1884 | ||
1785 | copy = min(bytes, iov->iov_len - base); | 1885 | copy = min(bytes, iov->iov_len - base); |
@@ -2004,11 +2104,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2004 | struct address_space *mapping = file->f_mapping; | 2104 | struct address_space *mapping = file->f_mapping; |
2005 | struct inode *inode = mapping->host; | 2105 | struct inode *inode = mapping->host; |
2006 | ssize_t written; | 2106 | ssize_t written; |
2107 | size_t write_len; | ||
2108 | pgoff_t end; | ||
2007 | 2109 | ||
2008 | if (count != ocount) | 2110 | if (count != ocount) |
2009 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2111 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
2010 | 2112 | ||
2011 | written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); | 2113 | /* |
2114 | * Unmap all mmappings of the file up-front. | ||
2115 | * | ||
2116 | * This will cause any pte dirty bits to be propagated into the | ||
2117 | * pageframes for the subsequent filemap_write_and_wait(). | ||
2118 | */ | ||
2119 | write_len = iov_length(iov, *nr_segs); | ||
2120 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | ||
2121 | if (mapping_mapped(mapping)) | ||
2122 | unmap_mapping_range(mapping, pos, write_len, 0); | ||
2123 | |||
2124 | written = filemap_write_and_wait(mapping); | ||
2125 | if (written) | ||
2126 | goto out; | ||
2127 | |||
2128 | /* | ||
2129 | * After a write we want buffered reads to be sure to go to disk to get | ||
2130 | * the new data. We invalidate clean cached page from the region we're | ||
2131 | * about to write. We do this *before* the write so that we can return | ||
2132 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | ||
2133 | */ | ||
2134 | if (mapping->nrpages) { | ||
2135 | written = invalidate_inode_pages2_range(mapping, | ||
2136 | pos >> PAGE_CACHE_SHIFT, end); | ||
2137 | if (written) | ||
2138 | goto out; | ||
2139 | } | ||
2140 | |||
2141 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); | ||
2142 | |||
2143 | /* | ||
2144 | * Finally, try again to invalidate clean pages which might have been | ||
2145 | * cached by non-direct readahead, or faulted in by get_user_pages() | ||
2146 | * if the source of the write was an mmap'ed region of the file | ||
2147 | * we're writing. Either one is a pretty crazy thing to do, | ||
2148 | * so we don't support it 100%. If this invalidation | ||
2149 | * fails, tough, the write still worked... | ||
2150 | */ | ||
2151 | if (mapping->nrpages) { | ||
2152 | invalidate_inode_pages2_range(mapping, | ||
2153 | pos >> PAGE_CACHE_SHIFT, end); | ||
2154 | } | ||
2155 | |||
2012 | if (written > 0) { | 2156 | if (written > 0) { |
2013 | loff_t end = pos + written; | 2157 | loff_t end = pos + written; |
2014 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { | 2158 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { |
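One bit of arithmetic in the new inline sequence is worth spelling out: end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT is the index of the last pagecache page the write touches, and both invalidate_inode_pages2_range() calls use it as their upper bound, with pos >> PAGE_CACHE_SHIFT as the lower bound. For example, with 4096-byte pages (PAGE_CACHE_SHIFT == 12), a 100-byte direct write at pos 8190 covers file offsets 8190 through 8289, so the lower bound is 8190 >> 12 = 1 and end = 8289 >> 12 = 2; pages 1 and 2 are the ones invalidated around the ->direct_IO() call.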
@@ -2024,6 +2168,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2024 | * i_mutex is held, which protects generic_osync_inode() from | 2168 | * i_mutex is held, which protects generic_osync_inode() from |
2025 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. | 2169 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. |
2026 | */ | 2170 | */ |
2171 | out: | ||
2027 | if ((written >= 0 || written == -EIOCBQUEUED) && | 2172 | if ((written >= 0 || written == -EIOCBQUEUED) && |
2028 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2173 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
2029 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 2174 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
@@ -2395,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2395 | if (count == 0) | 2540 | if (count == 0) |
2396 | goto out; | 2541 | goto out; |
2397 | 2542 | ||
2398 | err = remove_suid(file->f_path.dentry); | 2543 | err = file_remove_suid(file); |
2399 | if (err) | 2544 | if (err) |
2400 | goto out; | 2545 | goto out; |
2401 | 2546 | ||
@@ -2511,66 +2656,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2511 | } | 2656 | } |
2512 | EXPORT_SYMBOL(generic_file_aio_write); | 2657 | EXPORT_SYMBOL(generic_file_aio_write); |
2513 | 2658 | ||
2514 | /* | ||
2515 | * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something | ||
2516 | * went wrong during pagecache shootdown. | ||
2517 | */ | ||
2518 | static ssize_t | ||
2519 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
2520 | loff_t offset, unsigned long nr_segs) | ||
2521 | { | ||
2522 | struct file *file = iocb->ki_filp; | ||
2523 | struct address_space *mapping = file->f_mapping; | ||
2524 | ssize_t retval; | ||
2525 | size_t write_len; | ||
2526 | pgoff_t end = 0; /* silence gcc */ | ||
2527 | |||
2528 | /* | ||
2529 | * If it's a write, unmap all mmappings of the file up-front. This | ||
2530 | * will cause any pte dirty bits to be propagated into the pageframes | ||
2531 | * for the subsequent filemap_write_and_wait(). | ||
2532 | */ | ||
2533 | if (rw == WRITE) { | ||
2534 | write_len = iov_length(iov, nr_segs); | ||
2535 | end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; | ||
2536 | if (mapping_mapped(mapping)) | ||
2537 | unmap_mapping_range(mapping, offset, write_len, 0); | ||
2538 | } | ||
2539 | |||
2540 | retval = filemap_write_and_wait(mapping); | ||
2541 | if (retval) | ||
2542 | goto out; | ||
2543 | |||
2544 | /* | ||
2545 | * After a write we want buffered reads to be sure to go to disk to get | ||
2546 | * the new data. We invalidate clean cached page from the region we're | ||
2547 | * about to write. We do this *before* the write so that we can return | ||
2548 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | ||
2549 | */ | ||
2550 | if (rw == WRITE && mapping->nrpages) { | ||
2551 | retval = invalidate_inode_pages2_range(mapping, | ||
2552 | offset >> PAGE_CACHE_SHIFT, end); | ||
2553 | if (retval) | ||
2554 | goto out; | ||
2555 | } | ||
2556 | |||
2557 | retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); | ||
2558 | |||
2559 | /* | ||
2560 | * Finally, try again to invalidate clean pages which might have been | ||
2561 | * cached by non-direct readahead, or faulted in by get_user_pages() | ||
2562 | * if the source of the write was an mmap'ed region of the file | ||
2563 | * we're writing. Either one is a pretty crazy thing to do, | ||
2564 | * so we don't support it 100%. If this invalidation | ||
2565 | * fails, tough, the write still worked... | ||
2566 | */ | ||
2567 | if (rw == WRITE && mapping->nrpages) { | ||
2568 | invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); | ||
2569 | } | ||
2570 | out: | ||
2571 | return retval; | ||
2572 | } | ||
2573 | |||
2574 | /** | 2659 | /** |
2575 | * try_to_release_page() - release old fs-specific metadata on a page | 2660 | * try_to_release_page() - release old fs-specific metadata on a page |
2576 | * | 2661 | * |
@@ -2582,9 +2667,8 @@ out: | |||
2582 | * Otherwise return zero. | 2667 | * Otherwise return zero. |
2583 | * | 2668 | * |
2584 | * The @gfp_mask argument specifies whether I/O may be performed to release | 2669 | * The @gfp_mask argument specifies whether I/O may be performed to release |
2585 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). | 2670 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). |
2586 | * | 2671 | * |
2587 | * NOTE: @gfp_mask may go away, and this function may become non-blocking. | ||
2588 | */ | 2672 | */ |
2589 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | 2673 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
2590 | { | 2674 | { |