Diffstat (limited to 'mm/filemap.c')
 -rw-r--r--  mm/filemap.c  | 211
 1 file changed, 137 insertions(+), 74 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
  *  ->i_mutex
  *    ->i_alloc_sem		(various)
  *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
  *  ->i_mmap_lock
@@ -98,8 +98,10 @@ | |||
98 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 98 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
99 | * ->private_lock (page_remove_rmap->set_page_dirty) | 99 | * ->private_lock (page_remove_rmap->set_page_dirty) |
100 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 100 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
101 | * ->inode_lock (page_remove_rmap->set_page_dirty) | 101 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) |
102 | * ->inode_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
103 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | ||
104 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | ||
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 105 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 106 | * |
105 | * (code doesn't rely on that order, so you could switch it around) | 107 | * (code doesn't rely on that order, so you could switch it around) |
@@ -108,11 +110,11 @@
  */

 /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe. The caller must hold the mapping's tree_lock.
  */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;

@@ -137,7 +139,15 @@ void __remove_from_page_cache(struct page *page)
 	}
 }

-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	void (*freepage)(struct page *);
@@ -146,54 +156,25 @@ void remove_from_page_cache(struct page *page)

 	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
-	__remove_from_page_cache(page);
+	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);

 	if (freepage)
 		freepage(page);
+	page_cache_release(page);
 }
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);

-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
 	io_schedule();
 	return 0;
 }

-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-	sync_page(word);
+	sleep_on_page(word);
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }

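For context on the rename: delete_from_page_cache() now drops the pagecache reference itself (the new page_cache_release() call above), so a caller only puts its own reference afterwards. A minimal, hypothetical caller sketch (not part of this patch):

/* Assumes the caller holds the page lock and its own reference,
 * as required by delete_from_page_cache(). */
static void drop_locked_page(struct page *page)
{
	/* Old pattern: remove_from_page_cache(page) plus an explicit
	 * page_cache_release() for the pagecache reference. */
	delete_from_page_cache(page);	/* releases the pagecache ref */
	unlock_page(page);
	page_cache_release(page);	/* drop the caller's own ref */
}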
@@ -387,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 EXPORT_SYMBOL(filemap_write_and_wait_range);

 /**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+	int error;
+	struct mem_cgroup *memcg = NULL;
+
+	VM_BUG_ON(!PageLocked(old));
+	VM_BUG_ON(!PageLocked(new));
+	VM_BUG_ON(new->mapping);
+
+	/*
+	 * This is not page migration, but prepare_migration and
+	 * end_migration does enough work for charge replacement.
+	 *
+	 * In the longer term we probably want a specialized function
+	 * for moving the charge from old to new in a more efficient
+	 * manner.
+	 */
+	error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+	if (error)
+		return error;
+
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	if (!error) {
+		struct address_space *mapping = old->mapping;
+		void (*freepage)(struct page *);
+
+		pgoff_t offset = old->index;
+		freepage = mapping->a_ops->freepage;
+
+		page_cache_get(new);
+		new->mapping = mapping;
+		new->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(old);
+		error = radix_tree_insert(&mapping->page_tree, offset, new);
+		BUG_ON(error);
+		mapping->nrpages++;
+		__inc_zone_page_state(new, NR_FILE_PAGES);
+		if (PageSwapBacked(new))
+			__inc_zone_page_state(new, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+		if (freepage)
+			freepage(old);
+		page_cache_release(old);
+		mem_cgroup_end_migration(memcg, old, new, true);
+	} else {
+		mem_cgroup_end_migration(memcg, old, new, false);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
  * add_to_page_cache_locked - add a locked page to the pagecache
  * @page: page to add
  * @mapping: the page's address_space
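A hypothetical caller sketch for replace_page_cache_page() (not from this patch): both pages are assumed locked, as the kerneldoc requires, the caller holds its own reference on each, and the new page is assumed to be a regular file page, so lru_cache_add_file() is used for the LRU placement the function deliberately leaves to the caller.

static int swap_cached_page(struct page *oldpage, struct page *newpage)
{
	int err;

	/* Atomically substitute newpage for oldpage at the same index. */
	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
	if (err)
		return err;	/* only fails on allocation failure */

	/* replace_page_cache_page() does not touch the LRU; do it here. */
	lru_cache_add_file(newpage);
	return 0;
}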
@@ -479,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif

-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -512,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -596,24 +636,10 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);

-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-				TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			unsigned int flags)
 {
@@ -621,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 		__lock_page(page);
 		return 1;
 	} else {
-		up_read(&mm->mmap_sem);
-		wait_on_page_locked(page);
+		if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			up_read(&mm->mmap_sem);
+			wait_on_page_locked(page);
+		}
 		return 0;
 	}
 }
@@ -782,9 +810,13 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page)) {
-			if (ret)
-				start = pages[ret-1]->index;
+			WARN_ON(start | i);
 			goto restart;
 		}

@@ -800,6 +832,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 	return ret;
 }
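The new retry on (!ret && nr_found) exists because callers treat a return of 0 as end-of-scan. A hypothetical scan loop (not from this patch) shows the pattern that a racy empty batch would otherwise terminate early:

static void scan_mapping(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int nr, i;

	/* Stops as soon as a batch comes back empty, so a spurious 0
	 * from find_get_pages() would silently end the scan early. */
	while ((nr = find_get_pages(mapping, index, 16, pages)) != 0) {
		/* advance past the last page we were handed */
		index = pages[nr - 1]->index + 1;
		for (i = 0; i < nr; i++) {
			/* ... process pages[i] ... */
			page_cache_release(pages[i]);
		}
	}
}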
@@ -834,6 +873,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;

@@ -894,6 +938,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;

@@ -909,6 +958,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();

 	if (ret)
@@ -1298,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;

 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;

+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1376,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			break;
 	}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
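The struct blk_plug additions above (and in generic_file_aio_write() in the hunks below) follow the on-stack plugging pattern that replaces the old per-queue unplug path. A minimal sketch, with the actual I/O submission elided:

#include <linux/blkdev.h>

static void plugged_io_sketch(void)
{
	struct blk_plug plug;

	/* I/O queued between blk_start_plug() and blk_finish_plug() is
	 * held on a per-task list and dispatched as a batch when the
	 * plug is finished or the task sleeps, which is what makes the
	 * old ->sync_page() unplug hook unnecessary. */
	blk_start_plug(&plug);
	/* ... submit reads/writes here ... */
	blk_finish_plug(&plug);
}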
@@ -2487,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;

 	BUG_ON(iocb->ki_pos != pos);

 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);

@@ -2502,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);