Diffstat (limited to 'mm/shmem.c')
-rw-r--r--  mm/shmem.c | 513
1 file changed, 460 insertions(+), 53 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 3711422c3172..585bd220a21e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; | |||
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | 54 | #include <linux/pagevec.h> |
55 | #include <linux/percpu_counter.h> | 55 | #include <linux/percpu_counter.h> |
56 | #include <linux/falloc.h> | ||
56 | #include <linux/splice.h> | 57 | #include <linux/splice.h> |
57 | #include <linux/security.h> | 58 | #include <linux/security.h> |
58 | #include <linux/swapops.h> | 59 | #include <linux/swapops.h> |
@@ -83,12 +84,25 @@ struct shmem_xattr { | |||
83 | char value[0]; | 84 | char value[0]; |
84 | }; | 85 | }; |
85 | 86 | ||
87 | /* | ||
88 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | ||
89 | * (with i_mutex making sure that it has only one user at a time): | ||
90 | * we would prefer not to enlarge the shmem inode just for that. | ||
91 | */ | ||
92 | struct shmem_falloc { | ||
93 | pgoff_t start; /* start of range currently being fallocated */ | ||
94 | pgoff_t next; /* the next page offset to be fallocated */ | ||
95 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | ||
96 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ | ||
97 | }; | ||
98 | |||
86 | /* Flag allocation requirements to shmem_getpage */ | 99 | /* Flag allocation requirements to shmem_getpage */ |
87 | enum sgp_type { | 100 | enum sgp_type { |
88 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 101 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 102 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
90 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ | 103 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ |
91 | SGP_WRITE, /* may exceed i_size, may allocate page */ | 104 | SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ |
105 | SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ | ||
92 | }; | 106 | }; |
93 | 107 | ||
94 | #ifdef CONFIG_TMPFS | 108 | #ifdef CONFIG_TMPFS |
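The new shmem_falloc bookkeeping and the SGP_FALLOC mode back the fallocate(2) support that shmem_fallocate() adds later in this patch. A minimal userspace sketch of the call this enables on tmpfs (the path and size are illustrative, not from the patch):

/*
 * Illustrative only: preallocate 16MB on a tmpfs file with fallocate(2).
 * Before this series, tmpfs returned EOPNOTSUPP for this call.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/prealloc-test", O_RDWR | O_CREAT, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* mode 0: allocate blocks and extend i_size to 16MB */
	if (fallocate(fd, 0, 0, 16 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}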
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void) | |||
103 | } | 117 | } |
104 | #endif | 118 | #endif |
105 | 119 | ||
120 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); | ||
121 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
122 | struct shmem_inode_info *info, pgoff_t index); | ||
106 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | 123 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
107 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); | 124 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
108 | 125 | ||
@@ -423,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
423 | 440 | ||
424 | /* | 441 | /* |
425 | * Remove range of pages and swap entries from radix tree, and free them. | 442 | * Remove range of pages and swap entries from radix tree, and free them. |
443 | * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. | ||
426 | */ | 444 | */ |
427 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | 445 | static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, |
446 | bool unfalloc) | ||
428 | { | 447 | { |
429 | struct address_space *mapping = inode->i_mapping; | 448 | struct address_space *mapping = inode->i_mapping; |
430 | struct shmem_inode_info *info = SHMEM_I(inode); | 449 | struct shmem_inode_info *info = SHMEM_I(inode); |
431 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 450 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
432 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 451 | pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; |
433 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); | 452 | unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); |
453 | unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); | ||
434 | struct pagevec pvec; | 454 | struct pagevec pvec; |
435 | pgoff_t indices[PAGEVEC_SIZE]; | 455 | pgoff_t indices[PAGEVEC_SIZE]; |
436 | long nr_swaps_freed = 0; | 456 | long nr_swaps_freed = 0; |
437 | pgoff_t index; | 457 | pgoff_t index; |
438 | int i; | 458 | int i; |
439 | 459 | ||
440 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | 460 | if (lend == -1) |
461 | end = -1; /* unsigned, so actually very big */ | ||
441 | 462 | ||
442 | pagevec_init(&pvec, 0); | 463 | pagevec_init(&pvec, 0); |
443 | index = start; | 464 | index = start; |
444 | while (index <= end) { | 465 | while (index < end) { |
445 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 466 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
446 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 467 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
447 | pvec.pages, indices); | 468 | pvec.pages, indices); |
448 | if (!pvec.nr) | 469 | if (!pvec.nr) |
449 | break; | 470 | break; |
@@ -452,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
452 | struct page *page = pvec.pages[i]; | 473 | struct page *page = pvec.pages[i]; |
453 | 474 | ||
454 | index = indices[i]; | 475 | index = indices[i]; |
455 | if (index > end) | 476 | if (index >= end) |
456 | break; | 477 | break; |
457 | 478 | ||
458 | if (radix_tree_exceptional_entry(page)) { | 479 | if (radix_tree_exceptional_entry(page)) { |
480 | if (unfalloc) | ||
481 | continue; | ||
459 | nr_swaps_freed += !shmem_free_swap(mapping, | 482 | nr_swaps_freed += !shmem_free_swap(mapping, |
460 | index, page); | 483 | index, page); |
461 | continue; | 484 | continue; |
@@ -463,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
463 | 486 | ||
464 | if (!trylock_page(page)) | 487 | if (!trylock_page(page)) |
465 | continue; | 488 | continue; |
466 | if (page->mapping == mapping) { | 489 | if (!unfalloc || !PageUptodate(page)) { |
467 | VM_BUG_ON(PageWriteback(page)); | 490 | if (page->mapping == mapping) { |
468 | truncate_inode_page(mapping, page); | 491 | VM_BUG_ON(PageWriteback(page)); |
492 | truncate_inode_page(mapping, page); | ||
493 | } | ||
469 | } | 494 | } |
470 | unlock_page(page); | 495 | unlock_page(page); |
471 | } | 496 | } |
@@ -476,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
476 | index++; | 501 | index++; |
477 | } | 502 | } |
478 | 503 | ||
479 | if (partial) { | 504 | if (partial_start) { |
480 | struct page *page = NULL; | 505 | struct page *page = NULL; |
481 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); | 506 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
482 | if (page) { | 507 | if (page) { |
483 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 508 | unsigned int top = PAGE_CACHE_SIZE; |
509 | if (start > end) { | ||
510 | top = partial_end; | ||
511 | partial_end = 0; | ||
512 | } | ||
513 | zero_user_segment(page, partial_start, top); | ||
514 | set_page_dirty(page); | ||
515 | unlock_page(page); | ||
516 | page_cache_release(page); | ||
517 | } | ||
518 | } | ||
519 | if (partial_end) { | ||
520 | struct page *page = NULL; | ||
521 | shmem_getpage(inode, end, &page, SGP_READ, NULL); | ||
522 | if (page) { | ||
523 | zero_user_segment(page, 0, partial_end); | ||
484 | set_page_dirty(page); | 524 | set_page_dirty(page); |
485 | unlock_page(page); | 525 | unlock_page(page); |
486 | page_cache_release(page); | 526 | page_cache_release(page); |
487 | } | 527 | } |
488 | } | 528 | } |
529 | if (start >= end) | ||
530 | return; | ||
489 | 531 | ||
490 | index = start; | 532 | index = start; |
491 | for ( ; ; ) { | 533 | for ( ; ; ) { |
492 | cond_resched(); | 534 | cond_resched(); |
493 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 535 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
494 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 536 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
495 | pvec.pages, indices); | 537 | pvec.pages, indices); |
496 | if (!pvec.nr) { | 538 | if (!pvec.nr) { |
497 | if (index == start) | 539 | if (index == start || unfalloc) |
498 | break; | 540 | break; |
499 | index = start; | 541 | index = start; |
500 | continue; | 542 | continue; |
501 | } | 543 | } |
502 | if (index == start && indices[0] > end) { | 544 | if ((index == start || unfalloc) && indices[0] >= end) { |
503 | shmem_deswap_pagevec(&pvec); | 545 | shmem_deswap_pagevec(&pvec); |
504 | pagevec_release(&pvec); | 546 | pagevec_release(&pvec); |
505 | break; | 547 | break; |
@@ -509,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
509 | struct page *page = pvec.pages[i]; | 551 | struct page *page = pvec.pages[i]; |
510 | 552 | ||
511 | index = indices[i]; | 553 | index = indices[i]; |
512 | if (index > end) | 554 | if (index >= end) |
513 | break; | 555 | break; |
514 | 556 | ||
515 | if (radix_tree_exceptional_entry(page)) { | 557 | if (radix_tree_exceptional_entry(page)) { |
558 | if (unfalloc) | ||
559 | continue; | ||
516 | nr_swaps_freed += !shmem_free_swap(mapping, | 560 | nr_swaps_freed += !shmem_free_swap(mapping, |
517 | index, page); | 561 | index, page); |
518 | continue; | 562 | continue; |
519 | } | 563 | } |
520 | 564 | ||
521 | lock_page(page); | 565 | lock_page(page); |
522 | if (page->mapping == mapping) { | 566 | if (!unfalloc || !PageUptodate(page)) { |
523 | VM_BUG_ON(PageWriteback(page)); | 567 | if (page->mapping == mapping) { |
524 | truncate_inode_page(mapping, page); | 568 | VM_BUG_ON(PageWriteback(page)); |
569 | truncate_inode_page(mapping, page); | ||
570 | } | ||
525 | } | 571 | } |
526 | unlock_page(page); | 572 | unlock_page(page); |
527 | } | 573 | } |
@@ -535,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
535 | info->swapped -= nr_swaps_freed; | 581 | info->swapped -= nr_swaps_freed; |
536 | shmem_recalc_inode(inode); | 582 | shmem_recalc_inode(inode); |
537 | spin_unlock(&info->lock); | 583 | spin_unlock(&info->lock); |
584 | } | ||
538 | 585 | ||
586 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
587 | { | ||
588 | shmem_undo_range(inode, lstart, lend, false); | ||
539 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 589 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
540 | } | 590 | } |
541 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 591 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
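shmem_undo_range() now handles ranges that do not end on a page boundary: whole pages inside the range are freed, while the partial first and last pages are zeroed in place via zero_user_segment(). A userspace sketch of the hole-punch behaviour this enables (file name and offsets are made up for the example):

/*
 * Illustrative only: punch a hole covering bytes 1000..9095 of a tmpfs file.
 * Whole pages inside the range are freed; the partial first and last pages
 * are zeroed, which is what the partial_start/partial_end handling above does.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[16384];
	int fd = open("/dev/shm/punch-test", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0)
		return 1;

	memset(buf, 'x', sizeof(buf));
	write(fd, buf, sizeof(buf));

	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      1000, 8096) < 0)
		perror("fallocate(PUNCH_HOLE)");

	pread(fd, buf, sizeof(buf), 0);
	printf("byte 999=%c byte 1000=%d byte 9095=%d byte 9096=%c\n",
	       buf[999], buf[1000], buf[9095], buf[9096]);
	return 0;
}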
@@ -604,12 +654,13 @@ static void shmem_evict_inode(struct inode *inode) | |||
604 | * If swap found in inode, free it and move page from swapcache to filecache. | 654 | * If swap found in inode, free it and move page from swapcache to filecache. |
605 | */ | 655 | */ |
606 | static int shmem_unuse_inode(struct shmem_inode_info *info, | 656 | static int shmem_unuse_inode(struct shmem_inode_info *info, |
607 | swp_entry_t swap, struct page *page) | 657 | swp_entry_t swap, struct page **pagep) |
608 | { | 658 | { |
609 | struct address_space *mapping = info->vfs_inode.i_mapping; | 659 | struct address_space *mapping = info->vfs_inode.i_mapping; |
610 | void *radswap; | 660 | void *radswap; |
611 | pgoff_t index; | 661 | pgoff_t index; |
612 | int error; | 662 | gfp_t gfp; |
663 | int error = 0; | ||
613 | 664 | ||
614 | radswap = swp_to_radix_entry(swap); | 665 | radswap = swp_to_radix_entry(swap); |
615 | index = radix_tree_locate_item(&mapping->page_tree, radswap); | 666 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
@@ -625,22 +676,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
625 | if (shmem_swaplist.next != &info->swaplist) | 676 | if (shmem_swaplist.next != &info->swaplist) |
626 | list_move_tail(&shmem_swaplist, &info->swaplist); | 677 | list_move_tail(&shmem_swaplist, &info->swaplist); |
627 | 678 | ||
679 | gfp = mapping_gfp_mask(mapping); | ||
680 | if (shmem_should_replace_page(*pagep, gfp)) { | ||
681 | mutex_unlock(&shmem_swaplist_mutex); | ||
682 | error = shmem_replace_page(pagep, gfp, info, index); | ||
683 | mutex_lock(&shmem_swaplist_mutex); | ||
684 | /* | ||
685 | * We needed to drop mutex to make that restrictive page | ||
686 | * allocation; but the inode might already be freed by now, | ||
687 | * and we cannot refer to inode or mapping or info to check. | ||
688 | * However, we do hold page lock on the PageSwapCache page, | ||
689 | * so can check if that still has our reference remaining. | ||
690 | */ | ||
691 | if (!page_swapcount(*pagep)) | ||
692 | error = -ENOENT; | ||
693 | } | ||
694 | |||
628 | /* | 695 | /* |
629 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, | 696 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
630 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 697 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
631 | * beneath us (pagelock doesn't help until the page is in pagecache). | 698 | * beneath us (pagelock doesn't help until the page is in pagecache). |
632 | */ | 699 | */ |
633 | error = shmem_add_to_page_cache(page, mapping, index, | 700 | if (!error) |
701 | error = shmem_add_to_page_cache(*pagep, mapping, index, | ||
634 | GFP_NOWAIT, radswap); | 702 | GFP_NOWAIT, radswap); |
635 | /* which does mem_cgroup_uncharge_cache_page on error */ | ||
636 | |||
637 | if (error != -ENOMEM) { | 703 | if (error != -ENOMEM) { |
638 | /* | 704 | /* |
639 | * Truncation and eviction use free_swap_and_cache(), which | 705 | * Truncation and eviction use free_swap_and_cache(), which |
640 | * only does trylock page: if we raced, best clean up here. | 706 | * only does trylock page: if we raced, best clean up here. |
641 | */ | 707 | */ |
642 | delete_from_swap_cache(page); | 708 | delete_from_swap_cache(*pagep); |
643 | set_page_dirty(page); | 709 | set_page_dirty(*pagep); |
644 | if (!error) { | 710 | if (!error) { |
645 | spin_lock(&info->lock); | 711 | spin_lock(&info->lock); |
646 | info->swapped--; | 712 | info->swapped--; |
@@ -660,7 +726,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
660 | struct list_head *this, *next; | 726 | struct list_head *this, *next; |
661 | struct shmem_inode_info *info; | 727 | struct shmem_inode_info *info; |
662 | int found = 0; | 728 | int found = 0; |
663 | int error; | 729 | int error = 0; |
730 | |||
731 | /* | ||
732 | * There's a faint possibility that swap page was replaced before | ||
733 | * caller locked it: it will come back later with the right page. | ||
734 | */ | ||
735 | if (unlikely(!PageSwapCache(page))) | ||
736 | goto out; | ||
664 | 737 | ||
665 | /* | 738 | /* |
666 | * Charge page using GFP_KERNEL while we can wait, before taking | 739 | * Charge page using GFP_KERNEL while we can wait, before taking |
@@ -676,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
676 | list_for_each_safe(this, next, &shmem_swaplist) { | 749 | list_for_each_safe(this, next, &shmem_swaplist) { |
677 | info = list_entry(this, struct shmem_inode_info, swaplist); | 750 | info = list_entry(this, struct shmem_inode_info, swaplist); |
678 | if (info->swapped) | 751 | if (info->swapped) |
679 | found = shmem_unuse_inode(info, swap, page); | 752 | found = shmem_unuse_inode(info, swap, &page); |
680 | else | 753 | else |
681 | list_del_init(&info->swaplist); | 754 | list_del_init(&info->swaplist); |
682 | cond_resched(); | 755 | cond_resched(); |
@@ -685,8 +758,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
685 | } | 758 | } |
686 | mutex_unlock(&shmem_swaplist_mutex); | 759 | mutex_unlock(&shmem_swaplist_mutex); |
687 | 760 | ||
688 | if (!found) | ||
689 | mem_cgroup_uncharge_cache_page(page); | ||
690 | if (found < 0) | 761 | if (found < 0) |
691 | error = found; | 762 | error = found; |
692 | out: | 763 | out: |
@@ -727,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
727 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 798 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
728 | goto redirty; | 799 | goto redirty; |
729 | } | 800 | } |
801 | |||
802 | /* | ||
803 | * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC | ||
804 | * value into swapfile.c, the only way we can correctly account for a | ||
805 | * fallocated page arriving here is now to initialize it and write it. | ||
806 | * | ||
807 | * That's okay for a page already fallocated earlier, but if we have | ||
808 | * not yet completed the fallocation, then (a) we want to keep track | ||
809 | * of this page in case we have to undo it, and (b) it may not be a | ||
810 | * good idea to continue anyway, once we're pushing into swap. So | ||
811 | * reactivate the page, and let shmem_fallocate() quit when too many. | ||
812 | */ | ||
813 | if (!PageUptodate(page)) { | ||
814 | if (inode->i_private) { | ||
815 | struct shmem_falloc *shmem_falloc; | ||
816 | spin_lock(&inode->i_lock); | ||
817 | shmem_falloc = inode->i_private; | ||
818 | if (shmem_falloc && | ||
819 | index >= shmem_falloc->start && | ||
820 | index < shmem_falloc->next) | ||
821 | shmem_falloc->nr_unswapped++; | ||
822 | else | ||
823 | shmem_falloc = NULL; | ||
824 | spin_unlock(&inode->i_lock); | ||
825 | if (shmem_falloc) | ||
826 | goto redirty; | ||
827 | } | ||
828 | clear_highpage(page); | ||
829 | flush_dcache_page(page); | ||
830 | SetPageUptodate(page); | ||
831 | } | ||
832 | |||
730 | swap = get_swap_page(); | 833 | swap = get_swap_page(); |
731 | if (!swap.val) | 834 | if (!swap.val) |
732 | goto redirty; | 835 | goto redirty; |
@@ -856,6 +959,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
856 | #endif | 959 | #endif |
857 | 960 | ||
858 | /* | 961 | /* |
962 | * When a page is moved from swapcache to shmem filecache (either by the | ||
963 | * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of | ||
964 | * shmem_unuse_inode()), it may have been read in earlier from swap, in | ||
965 | * ignorance of the mapping it belongs to. If that mapping has special | ||
966 | * constraints (like the gma500 GEM driver, which requires RAM below 4GB), | ||
967 | * we may need to copy to a suitable page before moving to filecache. | ||
968 | * | ||
969 | * In a future release, this may well be extended to respect cpuset and | ||
970 | * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); | ||
971 | * but for now it is a simple matter of zone. | ||
972 | */ | ||
973 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp) | ||
974 | { | ||
975 | return page_zonenum(page) > gfp_zone(gfp); | ||
976 | } | ||
977 | |||
978 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
979 | struct shmem_inode_info *info, pgoff_t index) | ||
980 | { | ||
981 | struct page *oldpage, *newpage; | ||
982 | struct address_space *swap_mapping; | ||
983 | pgoff_t swap_index; | ||
984 | int error; | ||
985 | |||
986 | oldpage = *pagep; | ||
987 | swap_index = page_private(oldpage); | ||
988 | swap_mapping = page_mapping(oldpage); | ||
989 | |||
990 | /* | ||
991 | * We have arrived here because our zones are constrained, so don't | ||
992 | * limit chance of success by further cpuset and node constraints. | ||
993 | */ | ||
994 | gfp &= ~GFP_CONSTRAINT_MASK; | ||
995 | newpage = shmem_alloc_page(gfp, info, index); | ||
996 | if (!newpage) | ||
997 | return -ENOMEM; | ||
998 | VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); | ||
999 | |||
1000 | *pagep = newpage; | ||
1001 | page_cache_get(newpage); | ||
1002 | copy_highpage(newpage, oldpage); | ||
1003 | |||
1004 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1005 | __set_page_locked(newpage); | ||
1006 | VM_BUG_ON(!PageUptodate(oldpage)); | ||
1007 | SetPageUptodate(newpage); | ||
1008 | VM_BUG_ON(!PageSwapBacked(oldpage)); | ||
1009 | SetPageSwapBacked(newpage); | ||
1010 | VM_BUG_ON(!swap_index); | ||
1011 | set_page_private(newpage, swap_index); | ||
1012 | VM_BUG_ON(!PageSwapCache(oldpage)); | ||
1013 | SetPageSwapCache(newpage); | ||
1014 | |||
1015 | /* | ||
1016 | * Our caller will very soon move newpage out of swapcache, but it's | ||
1017 | * a nice clean interface for us to replace oldpage by newpage there. | ||
1018 | */ | ||
1019 | spin_lock_irq(&swap_mapping->tree_lock); | ||
1020 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | ||
1021 | newpage); | ||
1022 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | ||
1023 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
1024 | spin_unlock_irq(&swap_mapping->tree_lock); | ||
1025 | BUG_ON(error); | ||
1026 | |||
1027 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
1028 | lru_cache_add_anon(newpage); | ||
1029 | |||
1030 | ClearPageSwapCache(oldpage); | ||
1031 | set_page_private(oldpage, 0); | ||
1032 | |||
1033 | unlock_page(oldpage); | ||
1034 | page_cache_release(oldpage); | ||
1035 | page_cache_release(oldpage); | ||
1036 | return 0; | ||
1037 | } | ||
1038 | |||
1039 | /* | ||
859 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate | 1040 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
860 | * | 1041 | * |
861 | * If we allocate a new one we do not mark it dirty. That's up to the | 1042 | * If we allocate a new one we do not mark it dirty. That's up to the |
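As the comment above explains, shmem_replace_page() only comes into play when the mapping's allocation mask excludes the zone that a swapped-in page landed in. A driver-side sketch of how such a constraint is typically expressed (the gma500-style below-4GB mask here is an assumed example, not taken from this patch):

/*
 * Illustrative only: a driver that cannot address pages above 4GB constrains
 * its shmem file's allocation mask, so that pages swapped into an unsuitable
 * zone are caught by shmem_should_replace_page() and copied down by
 * shmem_replace_page().  The exact mask a real driver uses may differ.
 */
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>

static void example_constrain_backing_store(struct inode *inode)
{
	/* keep page cache allocations for this file below 4GB */
	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL | __GFP_DMA32);
}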
@@ -872,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
872 | swp_entry_t swap; | 1053 | swp_entry_t swap; |
873 | int error; | 1054 | int error; |
874 | int once = 0; | 1055 | int once = 0; |
1056 | int alloced = 0; | ||
875 | 1057 | ||
876 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) | 1058 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
877 | return -EFBIG; | 1059 | return -EFBIG; |
@@ -883,19 +1065,21 @@ repeat: | |||
883 | page = NULL; | 1065 | page = NULL; |
884 | } | 1066 | } |
885 | 1067 | ||
886 | if (sgp != SGP_WRITE && | 1068 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
887 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1069 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
888 | error = -EINVAL; | 1070 | error = -EINVAL; |
889 | goto failed; | 1071 | goto failed; |
890 | } | 1072 | } |
891 | 1073 | ||
1074 | /* fallocated page? */ | ||
1075 | if (page && !PageUptodate(page)) { | ||
1076 | if (sgp != SGP_READ) | ||
1077 | goto clear; | ||
1078 | unlock_page(page); | ||
1079 | page_cache_release(page); | ||
1080 | page = NULL; | ||
1081 | } | ||
892 | if (page || (sgp == SGP_READ && !swap.val)) { | 1082 | if (page || (sgp == SGP_READ && !swap.val)) { |
893 | /* | ||
894 | * Once we can get the page lock, it must be uptodate: | ||
895 | * if there were an error in reading back from swap, | ||
896 | * the page would not be inserted into the filecache. | ||
897 | */ | ||
898 | BUG_ON(page && !PageUptodate(page)); | ||
899 | *pagep = page; | 1083 | *pagep = page; |
900 | return 0; | 1084 | return 0; |
901 | } | 1085 | } |
@@ -923,19 +1107,20 @@ repeat: | |||
923 | 1107 | ||
924 | /* We have to do this with page locked to prevent races */ | 1108 | /* We have to do this with page locked to prevent races */ |
925 | lock_page(page); | 1109 | lock_page(page); |
1110 | if (!PageSwapCache(page) || page->mapping) { | ||
1111 | error = -EEXIST; /* try again */ | ||
1112 | goto failed; | ||
1113 | } | ||
926 | if (!PageUptodate(page)) { | 1114 | if (!PageUptodate(page)) { |
927 | error = -EIO; | 1115 | error = -EIO; |
928 | goto failed; | 1116 | goto failed; |
929 | } | 1117 | } |
930 | wait_on_page_writeback(page); | 1118 | wait_on_page_writeback(page); |
931 | 1119 | ||
932 | /* Someone may have already done it for us */ | 1120 | if (shmem_should_replace_page(page, gfp)) { |
933 | if (page->mapping) { | 1121 | error = shmem_replace_page(&page, gfp, info, index); |
934 | if (page->mapping == mapping && | 1122 | if (error) |
935 | page->index == index) | 1123 | goto failed; |
936 | goto done; | ||
937 | error = -EEXIST; | ||
938 | goto failed; | ||
939 | } | 1124 | } |
940 | 1125 | ||
941 | error = mem_cgroup_cache_charge(page, current->mm, | 1126 | error = mem_cgroup_cache_charge(page, current->mm, |
@@ -991,19 +1176,36 @@ repeat: | |||
991 | inode->i_blocks += BLOCKS_PER_PAGE; | 1176 | inode->i_blocks += BLOCKS_PER_PAGE; |
992 | shmem_recalc_inode(inode); | 1177 | shmem_recalc_inode(inode); |
993 | spin_unlock(&info->lock); | 1178 | spin_unlock(&info->lock); |
1179 | alloced = true; | ||
994 | 1180 | ||
995 | clear_highpage(page); | 1181 | /* |
996 | flush_dcache_page(page); | 1182 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
997 | SetPageUptodate(page); | 1183 | */ |
1184 | if (sgp == SGP_FALLOC) | ||
1185 | sgp = SGP_WRITE; | ||
1186 | clear: | ||
1187 | /* | ||
1188 | * Let SGP_WRITE caller clear ends if write does not fill page; | ||
1189 | * but SGP_FALLOC on a page fallocated earlier must initialize | ||
1190 | * it now, lest undo on failure cancel our earlier guarantee. | ||
1191 | */ | ||
1192 | if (sgp != SGP_WRITE) { | ||
1193 | clear_highpage(page); | ||
1194 | flush_dcache_page(page); | ||
1195 | SetPageUptodate(page); | ||
1196 | } | ||
998 | if (sgp == SGP_DIRTY) | 1197 | if (sgp == SGP_DIRTY) |
999 | set_page_dirty(page); | 1198 | set_page_dirty(page); |
1000 | } | 1199 | } |
1001 | done: | 1200 | |
1002 | /* Perhaps the file has been truncated since we checked */ | 1201 | /* Perhaps the file has been truncated since we checked */ |
1003 | if (sgp != SGP_WRITE && | 1202 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
1004 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1203 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
1005 | error = -EINVAL; | 1204 | error = -EINVAL; |
1006 | goto trunc; | 1205 | if (alloced) |
1206 | goto trunc; | ||
1207 | else | ||
1208 | goto failed; | ||
1007 | } | 1209 | } |
1008 | *pagep = page; | 1210 | *pagep = page; |
1009 | return 0; | 1211 | return 0; |
@@ -1012,6 +1214,7 @@ done: | |||
1012 | * Error recovery. | 1214 | * Error recovery. |
1013 | */ | 1215 | */ |
1014 | trunc: | 1216 | trunc: |
1217 | info = SHMEM_I(inode); | ||
1015 | ClearPageDirty(page); | 1218 | ClearPageDirty(page); |
1016 | delete_from_page_cache(page); | 1219 | delete_from_page_cache(page); |
1017 | spin_lock(&info->lock); | 1220 | spin_lock(&info->lock); |
@@ -1019,6 +1222,7 @@ trunc: | |||
1019 | inode->i_blocks -= BLOCKS_PER_PAGE; | 1222 | inode->i_blocks -= BLOCKS_PER_PAGE; |
1020 | spin_unlock(&info->lock); | 1223 | spin_unlock(&info->lock); |
1021 | decused: | 1224 | decused: |
1225 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1022 | if (sbinfo->max_blocks) | 1226 | if (sbinfo->max_blocks) |
1023 | percpu_counter_add(&sbinfo->used_blocks, -1); | 1227 | percpu_counter_add(&sbinfo->used_blocks, -1); |
1024 | unacct: | 1228 | unacct: |
@@ -1204,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1204 | if (pos + copied > inode->i_size) | 1408 | if (pos + copied > inode->i_size) |
1205 | i_size_write(inode, pos + copied); | 1409 | i_size_write(inode, pos + copied); |
1206 | 1410 | ||
1411 | if (!PageUptodate(page)) { | ||
1412 | if (copied < PAGE_CACHE_SIZE) { | ||
1413 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | ||
1414 | zero_user_segments(page, 0, from, | ||
1415 | from + copied, PAGE_CACHE_SIZE); | ||
1416 | } | ||
1417 | SetPageUptodate(page); | ||
1418 | } | ||
1207 | set_page_dirty(page); | 1419 | set_page_dirty(page); |
1208 | unlock_page(page); | 1420 | unlock_page(page); |
1209 | page_cache_release(page); | 1421 | page_cache_release(page); |
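The shmem_write_end() change covers a page that was fallocated but never written: if a write copies fewer than PAGE_CACHE_SIZE bytes into such a !Uptodate page, the uncopied parts must be zeroed before SetPageUptodate(), so later reads never see stale data. A userspace sketch of the visible effect (path and offsets are illustrative):

/*
 * Illustrative only: after fallocating a page and writing a few bytes into
 * the middle of it, the rest of that page must read back as zeroes -- the
 * zero_user_segments() above guarantees this for a not-yet-Uptodate page.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/dev/shm/write-test", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0)
		return 1;

	fallocate(fd, 0, 0, 4096);		/* page exists, never written */
	pwrite(fd, "hello", 5, 100);		/* short write mid-page */
	pread(fd, buf, sizeof(buf), 0);
	printf("buf[0]=%d buf[100..104]=%.5s buf[105]=%d\n",
	       buf[0], buf + 100, buf[105]);	/* expect 0, "hello", 0 */
	return 0;
}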
@@ -1462,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1462 | return error; | 1674 | return error; |
1463 | } | 1675 | } |
1464 | 1676 | ||
1677 | /* | ||
1678 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1679 | */ | ||
1680 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1681 | pgoff_t index, pgoff_t end, int origin) | ||
1682 | { | ||
1683 | struct page *page; | ||
1684 | struct pagevec pvec; | ||
1685 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1686 | bool done = false; | ||
1687 | int i; | ||
1688 | |||
1689 | pagevec_init(&pvec, 0); | ||
1690 | pvec.nr = 1; /* start small: we may be there already */ | ||
1691 | while (!done) { | ||
1692 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1693 | pvec.nr, pvec.pages, indices); | ||
1694 | if (!pvec.nr) { | ||
1695 | if (origin == SEEK_DATA) | ||
1696 | index = end; | ||
1697 | break; | ||
1698 | } | ||
1699 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1700 | if (index < indices[i]) { | ||
1701 | if (origin == SEEK_HOLE) { | ||
1702 | done = true; | ||
1703 | break; | ||
1704 | } | ||
1705 | index = indices[i]; | ||
1706 | } | ||
1707 | page = pvec.pages[i]; | ||
1708 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1709 | if (!PageUptodate(page)) | ||
1710 | page = NULL; | ||
1711 | } | ||
1712 | if (index >= end || | ||
1713 | (page && origin == SEEK_DATA) || | ||
1714 | (!page && origin == SEEK_HOLE)) { | ||
1715 | done = true; | ||
1716 | break; | ||
1717 | } | ||
1718 | } | ||
1719 | shmem_deswap_pagevec(&pvec); | ||
1720 | pagevec_release(&pvec); | ||
1721 | pvec.nr = PAGEVEC_SIZE; | ||
1722 | cond_resched(); | ||
1723 | } | ||
1724 | return index; | ||
1725 | } | ||
1726 | |||
1727 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
1728 | { | ||
1729 | struct address_space *mapping; | ||
1730 | struct inode *inode; | ||
1731 | pgoff_t start, end; | ||
1732 | loff_t new_offset; | ||
1733 | |||
1734 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
1735 | return generic_file_llseek_size(file, offset, origin, | ||
1736 | MAX_LFS_FILESIZE); | ||
1737 | mapping = file->f_mapping; | ||
1738 | inode = mapping->host; | ||
1739 | mutex_lock(&inode->i_mutex); | ||
1740 | /* We're holding i_mutex so we can access i_size directly */ | ||
1741 | |||
1742 | if (offset < 0) | ||
1743 | offset = -EINVAL; | ||
1744 | else if (offset >= inode->i_size) | ||
1745 | offset = -ENXIO; | ||
1746 | else { | ||
1747 | start = offset >> PAGE_CACHE_SHIFT; | ||
1748 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1749 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
1750 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1751 | if (new_offset > offset) { | ||
1752 | if (new_offset < inode->i_size) | ||
1753 | offset = new_offset; | ||
1754 | else if (origin == SEEK_DATA) | ||
1755 | offset = -ENXIO; | ||
1756 | else | ||
1757 | offset = inode->i_size; | ||
1758 | } | ||
1759 | } | ||
1760 | |||
1761 | if (offset >= 0 && offset != file->f_pos) { | ||
1762 | file->f_pos = offset; | ||
1763 | file->f_version = 0; | ||
1764 | } | ||
1765 | mutex_unlock(&inode->i_mutex); | ||
1766 | return offset; | ||
1767 | } | ||
1768 | |||
1769 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | ||
1770 | loff_t len) | ||
1771 | { | ||
1772 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1773 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
1774 | struct shmem_falloc shmem_falloc; | ||
1775 | pgoff_t start, index, end; | ||
1776 | int error; | ||
1777 | |||
1778 | mutex_lock(&inode->i_mutex); | ||
1779 | |||
1780 | if (mode & FALLOC_FL_PUNCH_HOLE) { | ||
1781 | struct address_space *mapping = file->f_mapping; | ||
1782 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | ||
1783 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | ||
1784 | |||
1785 | if ((u64)unmap_end > (u64)unmap_start) | ||
1786 | unmap_mapping_range(mapping, unmap_start, | ||
1787 | 1 + unmap_end - unmap_start, 0); | ||
1788 | shmem_truncate_range(inode, offset, offset + len - 1); | ||
1789 | /* No need to unmap again: hole-punching leaves COWed pages */ | ||
1790 | error = 0; | ||
1791 | goto out; | ||
1792 | } | ||
1793 | |||
1794 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ | ||
1795 | error = inode_newsize_ok(inode, offset + len); | ||
1796 | if (error) | ||
1797 | goto out; | ||
1798 | |||
1799 | start = offset >> PAGE_CACHE_SHIFT; | ||
1800 | end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1801 | /* Try to avoid a swapstorm if len is impossible to satisfy */ | ||
1802 | if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { | ||
1803 | error = -ENOSPC; | ||
1804 | goto out; | ||
1805 | } | ||
1806 | |||
1807 | shmem_falloc.start = start; | ||
1808 | shmem_falloc.next = start; | ||
1809 | shmem_falloc.nr_falloced = 0; | ||
1810 | shmem_falloc.nr_unswapped = 0; | ||
1811 | spin_lock(&inode->i_lock); | ||
1812 | inode->i_private = &shmem_falloc; | ||
1813 | spin_unlock(&inode->i_lock); | ||
1814 | |||
1815 | for (index = start; index < end; index++) { | ||
1816 | struct page *page; | ||
1817 | |||
1818 | /* | ||
1819 | * Good, the fallocate(2) manpage permits EINTR: we may have | ||
1820 | * been interrupted because we are using up too much memory. | ||
1821 | */ | ||
1822 | if (signal_pending(current)) | ||
1823 | error = -EINTR; | ||
1824 | else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) | ||
1825 | error = -ENOMEM; | ||
1826 | else | ||
1827 | error = shmem_getpage(inode, index, &page, SGP_FALLOC, | ||
1828 | NULL); | ||
1829 | if (error) { | ||
1830 | /* Remove the !PageUptodate pages we added */ | ||
1831 | shmem_undo_range(inode, | ||
1832 | (loff_t)start << PAGE_CACHE_SHIFT, | ||
1833 | (loff_t)index << PAGE_CACHE_SHIFT, true); | ||
1834 | goto undone; | ||
1835 | } | ||
1836 | |||
1837 | /* | ||
1838 | * Inform shmem_writepage() how far we have reached. | ||
1839 | * No need for lock or barrier: we have the page lock. | ||
1840 | */ | ||
1841 | shmem_falloc.next++; | ||
1842 | if (!PageUptodate(page)) | ||
1843 | shmem_falloc.nr_falloced++; | ||
1844 | |||
1845 | /* | ||
1846 | * If !PageUptodate, leave it that way so that freeable pages | ||
1847 | * can be recognized if we need to rollback on error later. | ||
1848 | * But set_page_dirty so that memory pressure will swap rather | ||
1849 | * than free the pages we are allocating (and SGP_CACHE pages | ||
1850 | * might still be clean: we now need to mark those dirty too). | ||
1851 | */ | ||
1852 | set_page_dirty(page); | ||
1853 | unlock_page(page); | ||
1854 | page_cache_release(page); | ||
1855 | cond_resched(); | ||
1856 | } | ||
1857 | |||
1858 | if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) | ||
1859 | i_size_write(inode, offset + len); | ||
1860 | inode->i_ctime = CURRENT_TIME; | ||
1861 | undone: | ||
1862 | spin_lock(&inode->i_lock); | ||
1863 | inode->i_private = NULL; | ||
1864 | spin_unlock(&inode->i_lock); | ||
1865 | out: | ||
1866 | mutex_unlock(&inode->i_mutex); | ||
1867 | return error; | ||
1868 | } | ||
1869 | |||
1465 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1870 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1466 | { | 1871 | { |
1467 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1872 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
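shmem_file_llseek() and shmem_seek_hole_data() give tmpfs SEEK_DATA/SEEK_HOLE support by walking the radix tree, treating !Uptodate (fallocated but unwritten) pages as holes. A userspace sketch probing a sparse tmpfs file (the file name and offsets are made up; results assume 4K pages):

/*
 * Illustrative only: create a 1MB sparse tmpfs file with one page of data at
 * 512K, then locate it with the lseek() support added above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/seek-test", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0)
		return 1;

	ftruncate(fd, 1 << 20);			/* 1MB, all hole */
	pwrite(fd, "data", 4, 512 * 1024);	/* data in one page at 512K */

	printf("first data at %lld\n",
	       (long long)lseek(fd, 0, SEEK_DATA));		/* 524288 */
	printf("next hole at %lld\n",
	       (long long)lseek(fd, 512 * 1024, SEEK_HOLE));	/* 528384 */
	return 0;
}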
@@ -1665,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1665 | kaddr = kmap_atomic(page); | 2070 | kaddr = kmap_atomic(page); |
1666 | memcpy(kaddr, symname, len); | 2071 | memcpy(kaddr, symname, len); |
1667 | kunmap_atomic(kaddr); | 2072 | kunmap_atomic(kaddr); |
2073 | SetPageUptodate(page); | ||
1668 | set_page_dirty(page); | 2074 | set_page_dirty(page); |
1669 | unlock_page(page); | 2075 | unlock_page(page); |
1670 | page_cache_release(page); | 2076 | page_cache_release(page); |
@@ -2268,6 +2674,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2268 | } | 2674 | } |
2269 | } | 2675 | } |
2270 | sb->s_export_op = &shmem_export_ops; | 2676 | sb->s_export_op = &shmem_export_ops; |
2677 | sb->s_flags |= MS_NOSEC; | ||
2271 | #else | 2678 | #else |
2272 | sb->s_flags |= MS_NOUSER; | 2679 | sb->s_flags |= MS_NOUSER; |
2273 | #endif | 2680 | #endif |
@@ -2362,7 +2769,7 @@ static const struct address_space_operations shmem_aops = { | |||
2362 | static const struct file_operations shmem_file_operations = { | 2769 | static const struct file_operations shmem_file_operations = { |
2363 | .mmap = shmem_mmap, | 2770 | .mmap = shmem_mmap, |
2364 | #ifdef CONFIG_TMPFS | 2771 | #ifdef CONFIG_TMPFS |
2365 | .llseek = generic_file_llseek, | 2772 | .llseek = shmem_file_llseek, |
2366 | .read = do_sync_read, | 2773 | .read = do_sync_read, |
2367 | .write = do_sync_write, | 2774 | .write = do_sync_write, |
2368 | .aio_read = shmem_file_aio_read, | 2775 | .aio_read = shmem_file_aio_read, |
@@ -2370,12 +2777,12 @@ static const struct file_operations shmem_file_operations = { | |||
2370 | .fsync = noop_fsync, | 2777 | .fsync = noop_fsync, |
2371 | .splice_read = shmem_file_splice_read, | 2778 | .splice_read = shmem_file_splice_read, |
2372 | .splice_write = generic_file_splice_write, | 2779 | .splice_write = generic_file_splice_write, |
2780 | .fallocate = shmem_fallocate, | ||
2373 | #endif | 2781 | #endif |
2374 | }; | 2782 | }; |
2375 | 2783 | ||
2376 | static const struct inode_operations shmem_inode_operations = { | 2784 | static const struct inode_operations shmem_inode_operations = { |
2377 | .setattr = shmem_setattr, | 2785 | .setattr = shmem_setattr, |
2378 | .truncate_range = shmem_truncate_range, | ||
2379 | #ifdef CONFIG_TMPFS_XATTR | 2786 | #ifdef CONFIG_TMPFS_XATTR |
2380 | .setxattr = shmem_setxattr, | 2787 | .setxattr = shmem_setxattr, |
2381 | .getxattr = shmem_getxattr, | 2788 | .getxattr = shmem_getxattr, |