author	Jiri Kosina <jkosina@suse.cz>	2011-04-26 04:22:15 -0400
committer	Jiri Kosina <jkosina@suse.cz>	2011-04-26 04:22:59 -0400
commit	07f9479a40cc778bc1462ada11f95b01360ae4ff (patch)
tree	0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /mm/filemap.c
parent	9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff)
parent	cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff)
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be applied for files that didn't exist on the old branch.
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	211
1 file changed, 137 insertions, 74 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
  *  ->i_mutex
  *    ->i_alloc_sem		(various)
  *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
  *  ->i_mmap_lock
@@ -98,8 +98,10 @@
  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
  * (code doesn't rely on that order, so you could switch it around)
@@ -108,11 +110,11 @@
  */
 
 /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe. The caller must hold the mapping's tree_lock.
  */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -137,7 +139,15 @@ void __remove_from_page_cache(struct page *page)
 	}
 }
 
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	void (*freepage)(struct page *);
@@ -146,54 +156,25 @@ void remove_from_page_cache(struct page *page)
 
 	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
-	__remove_from_page_cache(page);
+	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
 
 	if (freepage)
 		freepage(page);
+	page_cache_release(page);
 }
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
 
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
 	io_schedule();
 	return 0;
 }
 
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-	sync_page(word);
+	sleep_on_page(word);
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
@@ -387,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 EXPORT_SYMBOL(filemap_write_and_wait_range);
 
 /**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old:	page to be replaced
+ * @new:	page to replace with
+ * @gfp_mask:	allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+	int error;
+	struct mem_cgroup *memcg = NULL;
+
+	VM_BUG_ON(!PageLocked(old));
+	VM_BUG_ON(!PageLocked(new));
+	VM_BUG_ON(new->mapping);
+
+	/*
+	 * This is not page migration, but prepare_migration and
+	 * end_migration does enough work for charge replacement.
+	 *
+	 * In the longer term we probably want a specialized function
+	 * for moving the charge from old to new in a more efficient
+	 * manner.
+	 */
+	error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+	if (error)
+		return error;
+
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	if (!error) {
+		struct address_space *mapping = old->mapping;
+		void (*freepage)(struct page *);
+
+		pgoff_t offset = old->index;
+		freepage = mapping->a_ops->freepage;
+
+		page_cache_get(new);
+		new->mapping = mapping;
+		new->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(old);
+		error = radix_tree_insert(&mapping->page_tree, offset, new);
+		BUG_ON(error);
+		mapping->nrpages++;
+		__inc_zone_page_state(new, NR_FILE_PAGES);
+		if (PageSwapBacked(new))
+			__inc_zone_page_state(new, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+		if (freepage)
+			freepage(old);
+		page_cache_release(old);
+		mem_cgroup_end_migration(memcg, old, new, true);
+	} else {
+		mem_cgroup_end_migration(memcg, old, new, false);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
  * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:	page to add
  * @mapping:	the page's address_space
@@ -479,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -512,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -596,24 +636,10 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
@@ -621,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 		__lock_page(page);
 		return 1;
 	} else {
-		up_read(&mm->mmap_sem);
-		wait_on_page_locked(page);
+		if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			up_read(&mm->mmap_sem);
+			wait_on_page_locked(page);
+		}
 		return 0;
 	}
 }
@@ -782,9 +810,13 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page)) {
-			if (ret)
-				start = pages[ret-1]->index;
+			WARN_ON(start | i);
 			goto restart;
 		}
 
@@ -800,6 +832,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 	return ret;
 }
@@ -834,6 +873,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
@@ -894,6 +938,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
@@ -909,6 +958,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 
 	if (ret)
@@ -1298,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
 
+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1376,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			break;
 		}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
@@ -2487,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2502,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);
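
For context, the blk_start_plug()/blk_finish_plug() calls added to generic_file_aio_read() and generic_file_aio_write() above follow one simple pattern: an on-stack plug batches the requests a task submits and hands them to the block driver when the plug is finished. A minimal sketch, assuming a kernel-internal caller; example_submit_batch() is a made-up name and the actual I/O submission is elided:

#include <linux/blkdev.h>

static void example_submit_batch(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);	/* collect this task's requests on its plug list */
	/* ... submit the batch of bios/requests here ... */
	blk_finish_plug(&plug);	/* flush the plugged requests to the driver */
}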