Diffstat (limited to 'mm/filemap.c')
 -rw-r--r--  mm/filemap.c | 286
 1 file changed, 194 insertions(+), 92 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468..bcdc393b658 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 /*
@@ -58,16 +59,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(truncate_pagecache)
+ *  ->i_mmap_mutex		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_lock		(truncate->unmap_mapping_range)
+ *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_mutex
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -80,11 +81,11 @@
  *  ->i_mutex
  *    ->i_alloc_sem		(various)
  *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_lock
+ *  ->i_mmap_mutex
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
@@ -98,24 +99,36 @@
  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
  * (code doesn't rely on that order, so you could switch it around)
  * ->tasklist_lock		(memory_failure, collect_procs_ao)
- *   ->i_mmap_lock
+ *   ->i_mmap_mutex
  */
 
 /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe. The caller must hold the mapping's tree_lock.
  */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	/*
+	 * if we're uptodate, flush out into the cleancache, otherwise
+	 * invalidate any existing cleancache entries. We can't leave
+	 * stale data around in the cleancache once our page is gone
+	 */
+	if (PageUptodate(page) && PageMappedToDisk(page))
+		cleancache_put_page(page);
+	else
+		cleancache_flush_page(mapping, page);
+
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
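The two cleancache calls added above are the eviction-side hooks of the optional cleancache layer: a clean, uptodate, on-disk page is offered to the backend, anything else invalidates whatever copy the backend may still hold. For orientation, a hedged sketch of the matching lookup side, using only the <linux/cleancache.h> calls visible in this patch; the helper name and its caller are illustrative and not part of this change:

	#include <linux/cleancache.h>
	#include <linux/pagemap.h>

	/*
	 * Illustrative only: try to satisfy a locked, not-yet-uptodate page
	 * from cleancache before issuing real block I/O.
	 * cleancache_get_page() returns 0 on a hit and fills the page.
	 */
	static int try_cleancache_read(struct page *page)
	{
		if (cleancache_get_page(page) == 0) {
			SetPageUptodate(page);
			unlock_page(page);
			return 0;	/* hit: no read needed */
		}
		return -1;		/* miss: fall back to a real readpage */
	}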
@@ -137,7 +150,15 @@ void __remove_from_page_cache(struct page *page)
 	}
 }
 
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	void (*freepage)(struct page *);
@@ -146,54 +167,25 @@ void remove_from_page_cache(struct page *page)
 
 	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
-	__remove_from_page_cache(page);
+	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
 
 	if (freepage)
 		freepage(page);
+	page_cache_release(page);
 }
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
 
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
 	io_schedule();
 	return 0;
 }
 
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-	sync_page(word);
+	sleep_on_page(word);
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
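The practical effect of the rename above is that the helper now drops the page cache's own reference itself (the new page_cache_release() call), so callers no longer pair the removal with a manual release. A hedged before/after sketch of a typical caller conversion; the surrounding caller code is assumed, not shown in this diff:

	/* before: the caller releases the page cache's reference by hand */
	if (page->mapping == mapping) {
		remove_from_page_cache(page);
		page_cache_release(page);	/* drop the cache's ref */
	}

	/* after: the release is folded into the helper */
	if (page->mapping == mapping)
		delete_from_page_cache(page);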
@@ -387,6 +379,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 EXPORT_SYMBOL(filemap_write_and_wait_range);
 
 /**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old:	page to be replaced
+ * @new:	page to replace with
+ * @gfp_mask:	allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+	int error;
+	struct mem_cgroup *memcg = NULL;
+
+	VM_BUG_ON(!PageLocked(old));
+	VM_BUG_ON(!PageLocked(new));
+	VM_BUG_ON(new->mapping);
+
+	/*
+	 * This is not page migration, but prepare_migration and
+	 * end_migration does enough work for charge replacement.
+	 *
+	 * In the longer term we probably want a specialized function
+	 * for moving the charge from old to new in a more efficient
+	 * manner.
+	 */
+	error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+	if (error)
+		return error;
+
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	if (!error) {
+		struct address_space *mapping = old->mapping;
+		void (*freepage)(struct page *);
+
+		pgoff_t offset = old->index;
+		freepage = mapping->a_ops->freepage;
+
+		page_cache_get(new);
+		new->mapping = mapping;
+		new->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(old);
+		error = radix_tree_insert(&mapping->page_tree, offset, new);
+		BUG_ON(error);
+		mapping->nrpages++;
+		__inc_zone_page_state(new, NR_FILE_PAGES);
+		if (PageSwapBacked(new))
+			__inc_zone_page_state(new, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+		if (freepage)
+			freepage(old);
+		page_cache_release(old);
+		mem_cgroup_end_migration(memcg, old, new, true);
+	} else {
+		mem_cgroup_end_migration(memcg, old, new, false);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
  * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:	page to add
  * @mapping:	the page's address_space
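Usage-wise, replace_page_cache_page() expects both pages locked and leaves LRU placement to the caller. A hedged sketch of how a filesystem might splice in a replacement page, assuming the usual pagemap.h and swap.h helpers; the identifiers oldpage/newpage and the surrounding error handling are illustrative, not taken from this patch:

	/* both pages must be locked; the new page must not be in any mapping */
	lock_page(oldpage);
	__set_page_locked(newpage);
	error = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
	if (!error)
		lru_cache_add_file(newpage);	/* the helper does not touch the LRU */
	unlock_page(newpage);
	unlock_page(oldpage);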
@@ -479,12 +541,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -512,11 +568,22 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+	if (!test_bit(bit_nr, &page->flags))
+		return 0;
+
+	return __wait_on_bit(page_waitqueue(page), &wait,
+			     sleep_on_page_killable, TASK_KILLABLE);
+}
+
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
  * @page: Page defining the wait queue of interest
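wait_on_page_bit_killable() is the building block for the killable page-lock waits used further down in __lock_page_or_retry(). A sketch of the thin wrapper it enables, assuming it sits next to wait_on_page_locked() in include/linux/pagemap.h rather than in this file:

	static inline int wait_on_page_locked_killable(struct page *page)
	{
		if (PageLocked(page))
			return wait_on_page_bit_killable(page, PG_locked);
		return 0;
	}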
@@ -576,17 +643,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -596,34 +658,39 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
-	if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
-		__lock_page(page);
-		return 1;
-	} else {
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		/*
+		 * CAUTION! In this case, mmap_sem is not released
+		 * even though return 0.
+		 */
+		if (flags & FAULT_FLAG_RETRY_NOWAIT)
+			return 0;
+
 		up_read(&mm->mmap_sem);
-		wait_on_page_locked(page);
+		if (flags & FAULT_FLAG_KILLABLE)
+			wait_on_page_locked_killable(page);
+		else
+			wait_on_page_locked(page);
 		return 0;
+	} else {
+		if (flags & FAULT_FLAG_KILLABLE) {
+			int ret;
+
+			ret = __lock_page_killable(page);
+			if (ret) {
+				up_read(&mm->mmap_sem);
+				return 0;
+			}
+		} else
+			__lock_page(page);
+		return 1;
 	}
 }
 
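The return convention matters to fault handlers: 1 means the page is now locked and the fault can proceed; 0 means the lock was not taken and, unless FAULT_FLAG_RETRY_NOWAIT was set, mmap_sem has already been dropped. A hedged sketch of the consuming pattern, simplified from the filemap_fault() style of caller:

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		/* lock not acquired; mmap_sem is gone unless RETRY_NOWAIT was set */
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;
	}
	/* page is locked here; continue servicing the fault */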
@@ -782,9 +849,13 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page)) {
-			if (ret)
-				start = pages[ret-1]->index;
+			WARN_ON(start | i);
 			goto restart;
 		}
 
@@ -800,6 +871,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 	return ret;
 }
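The reason a transient zero return is worth a restart: gang-lookup callers conventionally treat 0 as "no more pages" and stop scanning. A hedged, illustrative caller loop showing that convention; details such as advancing the index past the returned pages are elided:

	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
		/* ... process the pages, advancing index past them ... */
		pagevec_release(&pvec);
	}
	/* a spurious 0 above would end the scan while pages remain */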
@@ -834,6 +912,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
@@ -894,6 +977,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
@@ -909,6 +997,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 
 	if (ret)
@@ -1298,12 +1393,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
 
+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1376,6 +1474,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			break;
 	}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
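With ->sync_page()-based unplugging gone (see the sync_page removal earlier in this diff), request batching is now done explicitly with the on-stack plugging API from <linux/blkdev.h>. The general shape, as a sketch:

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ... submit block I/O, e.g. via readpage/readpages ... */
	blk_finish_plug(&plug);		/* hand the batched requests to the driver */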
@@ -1468,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
 	/* If we don't want any read-ahead, don't bother */
 	if (VM_RandomReadHint(vma))
 		return;
+	if (!ra->ra_pages)
+		return;
 
-	if (VM_SequentialReadHint(vma) ||
-			offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+	if (VM_SequentialReadHint(vma)) {
 		page_cache_sync_readahead(mapping, ra, file, offset,
 					  ra->ra_pages);
 		return;
 	}
 
-	if (ra->mmap_miss < INT_MAX)
+	/* Avoid banging the cache line if not needed */
+	if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
 		ra->mmap_miss++;
 
 	/*
@@ -1490,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
 	 * mmap read-around
 	 */
 	ra_pages = max_sane_readahead(ra->ra_pages);
-	if (ra_pages) {
-		ra->start = max_t(long, 0, offset - ra_pages/2);
-		ra->size = ra_pages;
-		ra->async_size = 0;
-		ra_submit(ra, mapping, file);
-	}
+	ra->start = max_t(long, 0, offset - ra_pages / 2);
+	ra->size = ra_pages;
+	ra->async_size = ra_pages / 4;
+	ra_submit(ra, mapping, file);
 }
 
 /*
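Worked example of the new read-around sizing (the numbers are chosen for illustration): with ra->ra_pages = 32 and a fault at page offset 100, the window becomes start = max(0, 100 - 32/2) = 84, size = 32, and async_size = 32/4 = 8, so the trailing quarter of the window re-arms asynchronous readahead; the old code always submitted with async_size = 0. The new early return on !ra->ra_pages replaces the old "if (ra_pages)" guard around submission.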
@@ -1562,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
 		count_vm_event(PGMAJFAULT);
+		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
 retry_find:
 		page = find_get_page(mapping, offset);
@@ -1600,7 +1700,6 @@ retry_find:
 		return VM_FAULT_SIGBUS;
 	}
 
-	ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
@@ -2487,11 +2586,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2502,6 +2603,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);