path: root/mm/shmem.c
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	564
1 file changed, 460 insertions(+), 104 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index f99ff3e50bd6..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
+#include <linux/falloc.h>
 #include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
 	char value[0];
 };
 
+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+	pgoff_t start;		/* start of range currently being fallocated */
+	pgoff_t next;		/* the next page offset to be fallocated */
+	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
+	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+};
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
-	SGP_WRITE,	/* may exceed i_size, may allocate page */
+	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
+	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
 #ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
 				   pgoff_t index, gfp_t gfp, void *expected)
 {
-	int error = 0;
+	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapBacked(page));
 
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = index;
+
+	spin_lock_irq(&mapping->tree_lock);
 	if (!expected)
-		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		error = radix_tree_insert(&mapping->page_tree, index, page);
+	else
+		error = shmem_radix_tree_replace(mapping, index, expected,
+								 page);
 	if (!error) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = index;
-
-		spin_lock_irq(&mapping->tree_lock);
-		if (!expected)
-			error = radix_tree_insert(&mapping->page_tree,
-							index, page);
-		else
-			error = shmem_radix_tree_replace(mapping, index,
-							expected, page);
-		if (!error) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			__inc_zone_page_state(page, NR_SHMEM);
-			spin_unlock_irq(&mapping->tree_lock);
-		} else {
-			page->mapping = NULL;
-			spin_unlock_irq(&mapping->tree_lock);
-			page_cache_release(page);
-		}
-		if (!expected)
-			radix_tree_preload_end();
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+	} else {
+		page->mapping = NULL;
+		spin_unlock_irq(&mapping->tree_lock);
+		page_cache_release(page);
 	}
-	if (error)
-		mem_cgroup_uncharge_cache_page(page);
 	return error;
 }
 
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
 
 /*
  * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+								 bool unfalloc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t indices[PAGEVEC_SIZE];
 	long nr_swaps_freed = 0;
 	pgoff_t index;
 	int i;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	if (lend == -1)
+		end = -1;	/* unsigned, so actually very big */
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end) {
+	while (index < end) {
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr)
 			break;
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 
 			if (!trylock_page(page))
 				continue;
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = NULL;
 		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 		if (page) {
-			zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				top = partial_end;
+				partial_end = 0;
+			}
+			zero_user_segment(page, partial_start, top);
 			set_page_dirty(page);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (partial_end) {
+		struct page *page = NULL;
+		shmem_getpage(inode, end, &page, SGP_READ, NULL);
+		if (page) {
+			zero_user_segment(page, 0, partial_end);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start)
+			if (index == start || unfalloc)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] > end) {
+		if ((index == start || unfalloc) && indices[0] >= end) {
 			shmem_deswap_pagevec(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
 			}
 
 			lock_page(page);
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	info->swapped -= nr_swaps_freed;
 	shmem_recalc_inode(inode);
 	spin_unlock(&info->lock);
+}
 
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	shmem_undo_range(inode, lstart, lend, false);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode)
 	}
 	BUG_ON(inode->i_blocks);
 	shmem_free_inode(inode->i_sb);
-	end_writeback(inode);
+	clear_inode(inode);
 }
 
 /*
  * If swap found in inode, free it and move page from swapcache to filecache.
  */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
-			     swp_entry_t swap, struct page *page)
+			     swp_entry_t swap, struct page **pagep)
 {
 	struct address_space *mapping = info->vfs_inode.i_mapping;
 	void *radswap;
 	pgoff_t index;
-	int error;
+	gfp_t gfp;
+	int error = 0;
 
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	if (shmem_swaplist.next != &info->swaplist)
 		list_move_tail(&shmem_swaplist, &info->swaplist);
 
+	gfp = mapping_gfp_mask(mapping);
+	if (shmem_should_replace_page(*pagep, gfp)) {
+		mutex_unlock(&shmem_swaplist_mutex);
+		error = shmem_replace_page(pagep, gfp, info, index);
+		mutex_lock(&shmem_swaplist_mutex);
+		/*
+		 * We needed to drop mutex to make that restrictive page
+		 * allocation, but the inode might have been freed while we
+		 * dropped it: although a racing shmem_evict_inode() cannot
+		 * complete without emptying the radix_tree, our page lock
+		 * on this swapcache page is not enough to prevent that -
+		 * free_swap_and_cache() of our swap entry will only
+		 * trylock_page(), removing swap from radix_tree whatever.
+		 *
+		 * We must not proceed to shmem_add_to_page_cache() if the
+		 * inode has been freed, but of course we cannot rely on
+		 * inode or mapping or info to check that. However, we can
+		 * safely check if our swap entry is still in use (and here
+		 * it can't have got reused for another page): if it's still
+		 * in use, then the inode cannot have been freed yet, and we
+		 * can safely proceed (if it's no longer in use, that tells
+		 * nothing about the inode, but we don't need to unuse swap).
+		 */
+		if (!page_swapcount(*pagep))
+			error = -ENOENT;
+	}
+
 	/*
 	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
 	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
 	 * beneath us (pagelock doesn't help until the page is in pagecache).
 	 */
-	error = shmem_add_to_page_cache(page, mapping, index,
+	if (!error)
+		error = shmem_add_to_page_cache(*pagep, mapping, index,
 						GFP_NOWAIT, radswap);
-	/* which does mem_cgroup_uncharge_cache_page on error */
-
 	if (error != -ENOMEM) {
 		/*
 		 * Truncation and eviction use free_swap_and_cache(), which
 		 * only does trylock page: if we raced, best clean up here.
 		 */
-		delete_from_swap_cache(page);
-		set_page_dirty(page);
+		delete_from_swap_cache(*pagep);
+		set_page_dirty(*pagep);
 		if (!error) {
 			spin_lock(&info->lock);
 			info->swapped--;
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
 	int found = 0;
-	int error;
+	int error = 0;
+
+	/*
+	 * There's a faint possibility that swap page was replaced before
+	 * caller locked it: caller will come back later with the right page.
+	 */
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
+		goto out;
 
 	/*
 	 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, page);
+			found = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (!found)
-		mem_cgroup_uncharge_cache_page(page);
 	if (found < 0)
 		error = found;
 out:
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
 		goto redirty;
 	}
+
+	/*
+	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+	 * value into swapfile.c, the only way we can correctly account for a
+	 * fallocated page arriving here is now to initialize it and write it.
+	 *
+	 * That's okay for a page already fallocated earlier, but if we have
+	 * not yet completed the fallocation, then (a) we want to keep track
+	 * of this page in case we have to undo it, and (b) it may not be a
+	 * good idea to continue anyway, once we're pushing into swap. So
+	 * reactivate the page, and let shmem_fallocate() quit when too many.
+	 */
+	if (!PageUptodate(page)) {
+		if (inode->i_private) {
+			struct shmem_falloc *shmem_falloc;
+			spin_lock(&inode->i_lock);
+			shmem_falloc = inode->i_private;
+			if (shmem_falloc &&
+			    index >= shmem_falloc->start &&
+			    index < shmem_falloc->next)
+				shmem_falloc->nr_unswapped++;
+			else
+				shmem_falloc = NULL;
+			spin_unlock(&inode->i_lock);
+			if (shmem_falloc)
+				goto redirty;
+		}
+		clear_highpage(page);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+
 	swap = get_swap_page();
 	if (!swap.val)
 		goto redirty;
@@ -856,6 +979,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #endif
 
 /*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to. If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+	return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index)
+{
+	struct page *oldpage, *newpage;
+	struct address_space *swap_mapping;
+	pgoff_t swap_index;
+	int error;
+
+	oldpage = *pagep;
+	swap_index = page_private(oldpage);
+	swap_mapping = page_mapping(oldpage);
+
+	/*
+	 * We have arrived here because our zones are constrained, so don't
+	 * limit chance of success by further cpuset and node constraints.
+	 */
+	gfp &= ~GFP_CONSTRAINT_MASK;
+	newpage = shmem_alloc_page(gfp, info, index);
+	if (!newpage)
+		return -ENOMEM;
+
+	page_cache_get(newpage);
+	copy_highpage(newpage, oldpage);
+	flush_dcache_page(newpage);
+
+	__set_page_locked(newpage);
+	SetPageUptodate(newpage);
+	SetPageSwapBacked(newpage);
+	set_page_private(newpage, swap_index);
+	SetPageSwapCache(newpage);
+
+	/*
+	 * Our caller will very soon move newpage out of swapcache, but it's
+	 * a nice clean interface for us to replace oldpage by newpage there.
+	 */
+	spin_lock_irq(&swap_mapping->tree_lock);
+	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+								   newpage);
+	if (!error) {
+		__inc_zone_page_state(newpage, NR_FILE_PAGES);
+		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	}
+	spin_unlock_irq(&swap_mapping->tree_lock);
+
+	if (unlikely(error)) {
+		/*
+		 * Is this possible? I think not, now that our callers check
+		 * both PageSwapCache and page_private after getting page lock;
+		 * but be defensive. Reverse old to newpage for clear and free.
+		 */
+		oldpage = newpage;
+	} else {
+		mem_cgroup_replace_page_cache(oldpage, newpage);
+		lru_cache_add_anon(newpage);
+		*pagep = newpage;
+	}
+
+	ClearPageSwapCache(oldpage);
+	set_page_private(oldpage, 0);
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	page_cache_release(oldpage);
+	return error;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1078,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	swp_entry_t swap;
 	int error;
 	int once = 0;
+	int alloced = 0;
 
 	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
 		return -EFBIG;
@@ -883,19 +1090,21 @@ repeat:
 		page = NULL;
 	}
 
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto failed;
 	}
 
+	/* fallocated page? */
+	if (page && !PageUptodate(page)) {
+		if (sgp != SGP_READ)
+			goto clear;
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+	}
 	if (page || (sgp == SGP_READ && !swap.val)) {
-		/*
-		 * Once we can get the page lock, it must be uptodate:
-		 * if there were an error in reading back from swap,
-		 * the page would not be inserted into the filecache.
-		 */
-		BUG_ON(page && !PageUptodate(page));
 		*pagep = page;
 		return 0;
 	}
@@ -923,26 +1132,31 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
+		if (!PageSwapCache(page) || page_private(page) != swap.val ||
+		    !shmem_confirm_swap(mapping, index, swap)) {
+			error = -EEXIST;	/* try again */
+			goto unlock;
+		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
 			goto failed;
 		}
 		wait_on_page_writeback(page);
 
-		/* Someone may have already done it for us */
-		if (page->mapping) {
-			if (page->mapping == mapping &&
-			    page->index == index)
-				goto done;
-			error = -EEXIST;
-			goto failed;
+		if (shmem_should_replace_page(page, gfp)) {
+			error = shmem_replace_page(&page, gfp, info, index);
+			if (error)
+				goto failed;
 		}
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -979,11 +1193,18 @@ repeat:
 		__set_page_locked(page);
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
-			error = shmem_add_to_page_cache(page, mapping, index,
-						gfp, NULL);
 		if (error)
 			goto decused;
+		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		if (!error) {
+			error = shmem_add_to_page_cache(page, mapping, index,
+							gfp, NULL);
+			radix_tree_preload_end();
+		}
+		if (error) {
+			mem_cgroup_uncharge_cache_page(page);
+			goto decused;
+		}
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -991,19 +1212,36 @@ repeat:
 		inode->i_blocks += BLOCKS_PER_PAGE;
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
+		alloced = true;
 
-		clear_highpage(page);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
+		/*
+		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+		 */
+		if (sgp == SGP_FALLOC)
+			sgp = SGP_WRITE;
+clear:
+		/*
+		 * Let SGP_WRITE caller clear ends if write does not fill page;
+		 * but SGP_FALLOC on a page fallocated earlier must initialize
+		 * it now, lest undo on failure cancel our earlier guarantee.
+		 */
+		if (sgp != SGP_WRITE) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
-done:
+
 	/* Perhaps the file has been truncated since we checked */
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
-		goto trunc;
+		if (alloced)
+			goto trunc;
+		else
+			goto failed;
 	}
 	*pagep = page;
 	return 0;
@@ -1012,6 +1250,7 @@ done:
 	 * Error recovery.
 	 */
 trunc:
+	info = SHMEM_I(inode);
 	ClearPageDirty(page);
 	delete_from_page_cache(page);
 	spin_lock(&info->lock);
@@ -1019,19 +1258,16 @@ trunc:
 	inode->i_blocks -= BLOCKS_PER_PAGE;
 	spin_unlock(&info->lock);
 decused:
+	sbinfo = SHMEM_SB(inode->i_sb);
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1043,7 +1279,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }
@@ -1204,6 +1440,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	if (pos + copied > inode->i_size)
 		i_size_write(inode, pos + copied);
 
+	if (!PageUptodate(page)) {
+		if (copied < PAGE_CACHE_SIZE) {
+			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+			zero_user_segments(page, 0, from,
+					from + copied, PAGE_CACHE_SIZE);
+		}
+		SetPageUptodate(page);
+	}
 	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
@@ -1365,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
+		.nr_pages_max = PIPE_DEF_BUFFERS,
 		.flags = flags,
 		.ops = &page_cache_pipe_buf_ops,
 		.spd_release = spd_release_page,
@@ -1453,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	if (spd.nr_pages)
 		error = splice_to_pipe(pipe, &spd);
 
-	splice_shrink_spd(pipe, &spd);
+	splice_shrink_spd(&spd);
 
 	if (error > 0) {
 		*ppos += error;
@@ -1462,6 +1707,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+							 loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_falloc shmem_falloc;
+	pgoff_t start, index, end;
+	int error;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		struct address_space *mapping = file->f_mapping;
+		loff_t unmap_start = round_up(offset, PAGE_SIZE);
+		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+
+		if ((u64)unmap_end > (u64)unmap_start)
+			unmap_mapping_range(mapping, unmap_start,
+					    1 + unmap_end - unmap_start, 0);
+		shmem_truncate_range(inode, offset, offset + len - 1);
+		/* No need to unmap again: hole-punching leaves COWed pages */
+		error = 0;
+		goto out;
+	}
+
+	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+	error = inode_newsize_ok(inode, offset + len);
+	if (error)
+		goto out;
+
+	start = offset >> PAGE_CACHE_SHIFT;
+	end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/* Try to avoid a swapstorm if len is impossible to satisfy */
+	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	shmem_falloc.start = start;
+	shmem_falloc.next = start;
+	shmem_falloc.nr_falloced = 0;
+	shmem_falloc.nr_unswapped = 0;
+	spin_lock(&inode->i_lock);
+	inode->i_private = &shmem_falloc;
+	spin_unlock(&inode->i_lock);
+
+	for (index = start; index < end; index++) {
+		struct page *page;
+
+		/*
+		 * Good, the fallocate(2) manpage permits EINTR: we may have
+		 * been interrupted because we are using up too much memory.
+		 */
+		if (signal_pending(current))
+			error = -EINTR;
+		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+			error = -ENOMEM;
+		else
+			error = shmem_getpage(inode, index, &page, SGP_FALLOC,
+									NULL);
+		if (error) {
+			/* Remove the !PageUptodate pages we added */
+			shmem_undo_range(inode,
+				(loff_t)start << PAGE_CACHE_SHIFT,
+				(loff_t)index << PAGE_CACHE_SHIFT, true);
+			goto undone;
+		}
+
+		/*
+		 * Inform shmem_writepage() how far we have reached.
+		 * No need for lock or barrier: we have the page lock.
+		 */
+		shmem_falloc.next++;
+		if (!PageUptodate(page))
+			shmem_falloc.nr_falloced++;
+
+		/*
+		 * If !PageUptodate, leave it that way so that freeable pages
+		 * can be recognized if we need to rollback on error later.
+		 * But set_page_dirty so that memory pressure will swap rather
+		 * than free the pages we are allocating (and SGP_CACHE pages
+		 * might still be clean: we now need to mark those dirty too).
+		 */
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		cond_resched();
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+	inode->i_ctime = CURRENT_TIME;
+undone:
+	spin_lock(&inode->i_lock);
+	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
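For illustration only (not part of the diff): a minimal user-space sketch of the interface the shmem_fallocate() hunk above provides. It assumes /dev/shm is a tmpfs mount and that the running kernel carries this patch; the path and sizes are arbitrary. Plain mode 0 preallocates pages through SGP_FALLOC, while FALLOC_FL_PUNCH_HOLE (which the VFS only accepts together with FALLOC_FL_KEEP_SIZE) is served by shmem_truncate_range().

/* Illustrative sketch only: exercises tmpfs fallocate support from user space. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* "/dev/shm/falloc-demo" is a placeholder name on an assumed tmpfs mount. */
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1MiB: each page is instantiated via SGP_FALLOC above. */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate");
	/* Punch out 256KiB in the middle: handled by shmem_truncate_range(). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      1 << 18, 1 << 18) < 0)
		perror("punch hole");
	close(fd);
	unlink("/dev/shm/falloc-demo");
	return 0;
}

On a kernel without this patch, tmpfs has no .fallocate method and both calls fail with EOPNOTSUPP.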
@@ -1665,6 +2011,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr, symname, len);
 		kunmap_atomic(kaddr);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2033,11 +2380,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
 	return dentry;
 }
 
-static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
-				int connectable)
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+				struct inode *parent)
 {
-	struct inode *inode = dentry->d_inode;
-
 	if (*len < 3) {
 		*len = 3;
 		return 255;
@@ -2075,6 +2420,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			       bool remount)
 {
 	char *this_char, *value, *rest;
+	uid_t uid;
+	gid_t gid;
 
 	while (options != NULL) {
 		this_char = options;
@@ -2134,15 +2481,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		} else if (!strcmp(this_char,"uid")) {
 			if (remount)
 				continue;
-			sbinfo->uid = simple_strtoul(value, &rest, 0);
+			uid = simple_strtoul(value, &rest, 0);
 			if (*rest)
 				goto bad_val;
+			sbinfo->uid = make_kuid(current_user_ns(), uid);
+			if (!uid_valid(sbinfo->uid))
+				goto bad_val;
 		} else if (!strcmp(this_char,"gid")) {
 			if (remount)
 				continue;
-			sbinfo->gid = simple_strtoul(value, &rest, 0);
+			gid = simple_strtoul(value, &rest, 0);
 			if (*rest)
 				goto bad_val;
+			sbinfo->gid = make_kgid(current_user_ns(), gid);
+			if (!gid_valid(sbinfo->gid))
+				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
 			if (mpol_parse_str(value, &sbinfo->mpol, 1))
 				goto bad_val;
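For illustration only (not part of the diff): the uid=/gid= options parsed above arrive as plain numeric ids and are now mapped through make_kuid()/make_kgid() in the mounter's user namespace. A minimal sketch of supplying them with mount(2), assuming a hypothetical mount point /mnt/tmp and ids of 1000 chosen for the example (requires CAP_SYS_ADMIN):

/* Illustrative sketch only: numeric uid=/gid= options as parsed above. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "/mnt/tmp" and the ids are placeholders for this example. */
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
		  "size=64m,mode=0770,uid=1000,gid=1000") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}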
@@ -2210,10 +2563,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
-	if (sbinfo->uid != 0)
-		seq_printf(seq, ",uid=%u", sbinfo->uid);
-	if (sbinfo->gid != 0)
-		seq_printf(seq, ",gid=%u", sbinfo->gid);
+	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",uid=%u",
+				from_kuid_munged(&init_user_ns, sbinfo->uid));
+	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u",
+				from_kgid_munged(&init_user_ns, sbinfo->gid));
 	shmem_show_mpol(seq, sbinfo->mpol);
 	return 0;
 }
@@ -2260,6 +2615,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 	sb->s_export_op = &shmem_export_ops;
+	sb->s_flags |= MS_NOSEC;
 #else
 	sb->s_flags |= MS_NOUSER;
 #endif
@@ -2362,12 +2718,12 @@ static const struct file_operations shmem_file_operations = {
 	.fsync		= noop_fsync,
 	.splice_read	= shmem_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.fallocate	= shmem_fallocate,
 #endif
 };
 
 static const struct inode_operations shmem_inode_operations = {
 	.setattr	= shmem_setattr,
-	.truncate_range	= shmem_truncate_range,
 #ifdef CONFIG_TMPFS_XATTR
 	.setxattr	= shmem_setxattr,
 	.getxattr	= shmem_getxattr,