author	Hugh Dickins <hughd@google.com>	2012-05-29 18:06:38 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-05-29 19:22:22 -0400
commit	bde05d1ccd512696b09db9dd2e5f33ad19152605 (patch)
tree	affa2c836136cac6ec0e503ce8996670d385ebbb
parent	5ceb9ce6fe9462a298bb2cd5c9f1ca6cb80a0199 (diff)
shmem: replace page if mapping excludes its zone

The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the
backing RAM has to be below 4GB. Not a problem while the boards
supported only 4GB: but now Intel's D2700MUD boards support 8GB, and
their GMA3600 is managed by the GMA500 driver.

shmem/tmpfs has never pretended to support hardware restrictions on the
backing memory, but it might have appeared to do so before v3.1, and
even now it works fine until a page is swapped out then back in. When
read_cache_page_gfp() supplied a freshly allocated page for copy, that
compensated for whatever choice might have been made by earlier swapin
readahead; but swapoff was likely to destroy the illusion.

We'd like to continue to support GMA500, so now add a new
shmem_should_replace_page() check on the zone when about to move a page
from swapcache to filecache (in swapin and swapoff cases), with
shmem_replace_page() to allocate and substitute a suitable page (given
gma500/gem.c's mapping_set_gfp_mask GFP_KERNEL | __GFP_DMA32).
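
For context, a minimal sketch of how a driver imposes such a restriction
on its shmem mapping (illustrative only, not the exact gma500/gem.c code;
"obj" stands in for the driver's shmem-backed GEM object):

	/* Illustrative driver-side setup: obj is the shmem-backed GEM object */
	struct address_space *mapping = obj->filp->f_mapping;

	/* ask shmem for pages below 4GB, now honoured on swapin replace too */
	mapping_set_gfp_mask(mapping, GFP_KERNEL | __GFP_DMA32);

shmem_should_replace_page() then compares the zone of the swapped-in page
against gfp_zone() of that mask, and shmem_replace_page() copies the data
to a conforming page when they disagree.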

This does involve a minor extension to mem_cgroup_replace_page_cache()
(the page may or may not have already been charged); and I've removed a
comment and call to mem_cgroup_uncharge_cache_page(), which in fact is
always a no-op while PageSwapCache.

Also removed optimization of an unlikely path in shmem_getpage_gfp(),
now that we need to check PageSwapCache more carefully (a racing caller
might already have made the copy). And at one point shmem_unuse_inode()
needs to use the hitherto private page_swapcount(), to guard against
racing with inode eviction.

It would make sense to extend shmem_should_replace_page(), to cover
cpuset and NUMA mempolicy restrictions too, but set that aside for now:
needs a cleanup of shmem mempolicy handling, and more testing, and ought
to handle swap faults in do_swap_page() as well as shmem.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Stephane Marchesin <marcheu@chromium.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Rob Clark <rob.clark@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

-rw-r--r--	include/linux/swap.h	  6
-rw-r--r--	mm/memcontrol.c	 17
-rw-r--r--	mm/shmem.c	141
-rw-r--r--	mm/swapfile.c	  2
4 files changed, 142 insertions(+), 24 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc3073ce95cc..d965c4bfab3a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,6 +351,7 @@ extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
+extern int page_swapcount(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
@@ -445,6 +446,11 @@ static inline void delete_from_swap_cache(struct page *page)
 {
 }
 
+static inline int page_swapcount(struct page *page)
+{
+	return 0;
+}
+
 #define reuse_swap_page(page)	(page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f71219cc53e..d7ce417cae7c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3373,7 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 void mem_cgroup_replace_page_cache(struct page *oldpage,
 				  struct page *newpage)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 
@@ -3383,11 +3383,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
 	pc = lookup_page_cgroup(oldpage);
 	/* fix accounting on old pages */
 	lock_page_cgroup(pc);
-	memcg = pc->mem_cgroup;
-	mem_cgroup_charge_statistics(memcg, false, -1);
-	ClearPageCgroupUsed(pc);
+	if (PageCgroupUsed(pc)) {
+		memcg = pc->mem_cgroup;
+		mem_cgroup_charge_statistics(memcg, false, -1);
+		ClearPageCgroupUsed(pc);
+	}
 	unlock_page_cgroup(pc);
 
+	/*
+	 * When called from shmem_replace_page(), in some cases the
+	 * oldpage has already been charged, and in some cases not.
+	 */
+	if (!memcg)
+		return;
+
 	if (PageSwapBacked(oldpage))
 		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index be5af34a070d..db72d8e44ec6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -103,6 +103,9 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 
@@ -604,12 +607,13 @@ static void shmem_evict_inode(struct inode *inode)
  * If swap found in inode, free it and move page from swapcache to filecache.
  */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
-			     swp_entry_t swap, struct page *page)
+			     swp_entry_t swap, struct page **pagep)
 {
 	struct address_space *mapping = info->vfs_inode.i_mapping;
 	void *radswap;
 	pgoff_t index;
-	int error;
+	gfp_t gfp;
+	int error = 0;
 
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +629,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	if (shmem_swaplist.next != &info->swaplist)
 		list_move_tail(&shmem_swaplist, &info->swaplist);
 
+	gfp = mapping_gfp_mask(mapping);
+	if (shmem_should_replace_page(*pagep, gfp)) {
+		mutex_unlock(&shmem_swaplist_mutex);
+		error = shmem_replace_page(pagep, gfp, info, index);
+		mutex_lock(&shmem_swaplist_mutex);
+		/*
+		 * We needed to drop mutex to make that restrictive page
+		 * allocation; but the inode might already be freed by now,
+		 * and we cannot refer to inode or mapping or info to check.
+		 * However, we do hold page lock on the PageSwapCache page,
+		 * so can check if that still has our reference remaining.
+		 */
+		if (!page_swapcount(*pagep))
+			error = -ENOENT;
+	}
+
 	/*
 	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
 	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
 	 * beneath us (pagelock doesn't help until the page is in pagecache).
 	 */
-	error = shmem_add_to_page_cache(page, mapping, index,
+	if (!error)
+		error = shmem_add_to_page_cache(*pagep, mapping, index,
 						GFP_NOWAIT, radswap);
-	/* which does mem_cgroup_uncharge_cache_page on error */
-
 	if (error != -ENOMEM) {
 		/*
 		 * Truncation and eviction use free_swap_and_cache(), which
 		 * only does trylock page: if we raced, best clean up here.
 		 */
-		delete_from_swap_cache(page);
-		set_page_dirty(page);
+		delete_from_swap_cache(*pagep);
+		set_page_dirty(*pagep);
 		if (!error) {
 			spin_lock(&info->lock);
 			info->swapped--;
@@ -660,7 +679,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
 	int found = 0;
-	int error;
+	int error = 0;
+
+	/*
+	 * There's a faint possibility that swap page was replaced before
+	 * caller locked it: it will come back later with the right page.
+	 */
+	if (unlikely(!PageSwapCache(page)))
+		goto out;
 
 	/*
 	 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +702,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, page);
+			found = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
@@ -685,8 +711,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (!found)
-		mem_cgroup_uncharge_cache_page(page);
 	if (found < 0)
 		error = found;
 out:
@@ -856,6 +880,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #endif
 
 /*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to.  If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+	return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index)
+{
+	struct page *oldpage, *newpage;
+	struct address_space *swap_mapping;
+	pgoff_t swap_index;
+	int error;
+
+	oldpage = *pagep;
+	swap_index = page_private(oldpage);
+	swap_mapping = page_mapping(oldpage);
+
+	/*
+	 * We have arrived here because our zones are constrained, so don't
+	 * limit chance of success by further cpuset and node constraints.
+	 */
+	gfp &= ~GFP_CONSTRAINT_MASK;
+	newpage = shmem_alloc_page(gfp, info, index);
+	if (!newpage)
+		return -ENOMEM;
+	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
+
+	*pagep = newpage;
+	page_cache_get(newpage);
+	copy_highpage(newpage, oldpage);
+
+	VM_BUG_ON(!PageLocked(oldpage));
+	__set_page_locked(newpage);
+	VM_BUG_ON(!PageUptodate(oldpage));
+	SetPageUptodate(newpage);
+	VM_BUG_ON(!PageSwapBacked(oldpage));
+	SetPageSwapBacked(newpage);
+	VM_BUG_ON(!swap_index);
+	set_page_private(newpage, swap_index);
+	VM_BUG_ON(!PageSwapCache(oldpage));
+	SetPageSwapCache(newpage);
+
+	/*
+	 * Our caller will very soon move newpage out of swapcache, but it's
+	 * a nice clean interface for us to replace oldpage by newpage there.
+	 */
+	spin_lock_irq(&swap_mapping->tree_lock);
+	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+								   newpage);
+	__inc_zone_page_state(newpage, NR_FILE_PAGES);
+	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	spin_unlock_irq(&swap_mapping->tree_lock);
+	BUG_ON(error);
+
+	mem_cgroup_replace_page_cache(oldpage, newpage);
+	lru_cache_add_anon(newpage);
+
+	ClearPageSwapCache(oldpage);
+	set_page_private(oldpage, 0);
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	page_cache_release(oldpage);
+	return 0;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
@@ -923,19 +1025,20 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
+		if (!PageSwapCache(page) || page->mapping) {
+			error = -EEXIST;	/* try again */
+			goto failed;
+		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
 			goto failed;
 		}
 		wait_on_page_writeback(page);
 
-		/* Someone may have already done it for us */
-		if (page->mapping) {
-			if (page->mapping == mapping &&
-			    page->index == index)
-				goto done;
-			error = -EEXIST;
-			goto failed;
+		if (shmem_should_replace_page(page, gfp)) {
+			error = shmem_replace_page(&page, gfp, info, index);
+			if (error)
+				goto failed;
 		}
 
 		error = mem_cgroup_cache_charge(page, current->mm,
@@ -998,7 +1101,7 @@ repeat:
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
-done:
+
 	/* Perhaps the file has been truncated since we checked */
 	if (sgp != SGP_WRITE &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..b0c86e92f42c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
  * This does not give an exact answer when swap count is continued,
  * but does include the high COUNT_CONTINUED flag to allow for that.
  */
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
 {
 	int count = 0;
 	struct swap_info_struct *p;