author     Hugh Dickins <hughd@google.com>                 2011-08-03 19:21:24 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-08-03 20:25:23 -0400
commit     54af60421822bb9cb664dd5cd7aac46c01ccfcf8 (patch)
tree       b62d2b9ccc2b467ee5406bb0d3494db6a5f96582 /mm/shmem.c
parent     46f65ec15c6878a2b4a49f6e01b20b201b46a9e4 (diff)
tmpfs: convert shmem_getpage_gfp to radix-swap
Convert shmem_getpage_gfp(), the engine-room of shmem, to expect either a
page or a swap entry returned from the radix tree by find_lock_page().
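The key change is that find_lock_page() can now return either a locked page or a radix-tree exceptional entry encoding a swap entry. A minimal sketch of how a caller tells the two apart (the helper name here is made up; the real handling is open-coded in shmem_getpage_gfp() in the hunk below):

```c
/*
 * Sketch only: the real logic is open-coded in shmem_getpage_gfp().
 * find_lock_page() may return a locked page, or an exceptional
 * radix-tree entry that encodes a swap entry instead of a page.
 */
static struct page *shmem_lookup_page_or_swap(struct address_space *mapping,
					      pgoff_t index, swp_entry_t *swap)
{
	struct page *page = find_lock_page(mapping, index);

	swap->val = 0;
	if (radix_tree_exceptional_entry(page)) {
		/* Not a page: decode the swap entry and report no page. */
		*swap = radix_to_swp_entry(page);
		page = NULL;
	}
	return page;
}
```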
Whereas the repetitive old method proceeded mainly under info->lock,
dropping the lock and retrying whenever one of the needed conditions was
not met, now we can proceed without it, leaving shmem_add_to_page_cache()
to check for a race.
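shmem_add_to_page_cache() itself comes from the parent commit; what matters here is the calling convention visible in the diff: the caller passes the radix-tree entry it expects the slot to hold, so a concurrent change is caught at insertion time rather than excluded with info->lock. Roughly, as used in the hunks below (the helper's internals are not shown or assumed here):

```c
/* Swapin path: the slot should still hold the swap entry we looked up. */
error = shmem_add_to_page_cache(page, mapping, index,
				gfp, swp_to_radix_entry(swap));

/* Fresh allocation: the slot is expected to be empty. */
error = shmem_add_to_page_cache(page, mapping, index,
				gfp, NULL);
```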
This way there is no need to preallocate a page, no need for an early
radix_tree_preload(), no need for mem_cgroup_shmem_charge_fallback().
Move the error unwinding down to the bottom instead of repeating it
throughout. ENOSPC handling is a little different from before: there is
no longer any race between find_lock_page() and finding swap, but we can
arrive at ENOSPC before calling shmem_recalc_inode(), which might
occasionally discover freed space.
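That retry is visible at the tail of the new error path: a single ENOSPC triggers one recalculation and one more pass, in case truncation or swapoff has just released blocks. The shape, lifted from the hunk below:

```c
if (error == -ENOSPC && !once++) {
	info = SHMEM_I(inode);
	spin_lock(&info->lock);
	shmem_recalc_inode(inode);	/* may discover freed space */
	spin_unlock(&info->lock);
	goto repeat;
}
```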
Be stricter about checking i_size before returning. info->lock is now used
for little but the alloced, swapped and i_blocks updates. Move the i_blocks
updates out from under the max_blocks check, so that even an unlimited
size=0 mount can show accurate du.
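Concretely, i_blocks is now adjusted on every allocation and every free, while only the used_blocks percpu counter stays conditional on max_blocks; that is what keeps du accurate on an unlimited mount. The new accounting in shmem_recalc_inode(), as it appears in the hunk below:

```c
if (freed > 0) {
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	if (sbinfo->max_blocks)		/* limited mounts only */
		percpu_counter_add(&sbinfo->used_blocks, -freed);
	info->alloced -= freed;
	inode->i_blocks -= freed * BLOCKS_PER_PAGE;	/* always updated */
	shmem_unacct_blocks(info->flags, freed);
}
```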
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--  mm/shmem.c  259
1 file changed, 112 insertions(+), 147 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 174f97188e8a..92f01d7cc150 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -166,15 +166,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static void shmem_free_blocks(struct inode *inode, long pages)
-{
-	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-	if (sbinfo->max_blocks) {
-		percpu_counter_add(&sbinfo->used_blocks, -pages);
-		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
-	}
-}
-
 static int shmem_reserve_inode(struct super_block *sb)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -219,9 +210,12 @@ static void shmem_recalc_inode(struct inode *inode)
 
 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 	if (freed > 0) {
+		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+		if (sbinfo->max_blocks)
+			percpu_counter_add(&sbinfo->used_blocks, -freed);
 		info->alloced -= freed;
+		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
 		shmem_unacct_blocks(info->flags, freed);
-		shmem_free_blocks(inode, freed);
 	}
 }
 
@@ -888,205 +882,180 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
 {
 	struct address_space *mapping = inode->i_mapping;
-	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
 	struct page *page;
-	struct page *prealloc_page = NULL;
 	swp_entry_t swap;
 	int error;
+	int once = 0;
 
 	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
 		return -EFBIG;
 repeat:
+	swap.val = 0;
 	page = find_lock_page(mapping, index);
-	if (page) {
+	if (radix_tree_exceptional_entry(page)) {
+		swap = radix_to_swp_entry(page);
+		page = NULL;
+	}
+
+	if (sgp != SGP_WRITE &&
+	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+		error = -EINVAL;
+		goto failed;
+	}
+
+	if (page || (sgp == SGP_READ && !swap.val)) {
 		/*
 		 * Once we can get the page lock, it must be uptodate:
 		 * if there were an error in reading back from swap,
 		 * the page would not be inserted into the filecache.
 		 */
-		BUG_ON(!PageUptodate(page));
-		goto done;
+		BUG_ON(page && !PageUptodate(page));
+		*pagep = page;
+		return 0;
 	}
 
 	/*
-	 * Try to preload while we can wait, to not make a habit of
-	 * draining atomic reserves; but don't latch on to this cpu.
+	 * Fast cache lookup did not find it:
+	 * bring it back from swap or allocate.
 	 */
-	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
-	if (error)
-		goto out;
-	radix_tree_preload_end();
-
-	if (sgp != SGP_READ && !prealloc_page) {
-		prealloc_page = shmem_alloc_page(gfp, info, index);
-		if (prealloc_page) {
-			SetPageSwapBacked(prealloc_page);
-			if (mem_cgroup_cache_charge(prealloc_page,
-					current->mm, GFP_KERNEL)) {
-				page_cache_release(prealloc_page);
-				prealloc_page = NULL;
-			}
-		}
-	}
+	info = SHMEM_I(inode);
+	sbinfo = SHMEM_SB(inode->i_sb);
 
-	spin_lock(&info->lock);
-	shmem_recalc_inode(inode);
-	swap = shmem_get_swap(info, index);
 	if (swap.val) {
 		/* Look it up and read it in.. */
 		page = lookup_swap_cache(swap);
 		if (!page) {
-			spin_unlock(&info->lock);
 			/* here we actually do the io */
 			if (fault_type)
 				*fault_type |= VM_FAULT_MAJOR;
 			page = shmem_swapin(swap, gfp, info, index);
 			if (!page) {
-				swp_entry_t nswap = shmem_get_swap(info, index);
-				if (nswap.val == swap.val) {
-					error = -ENOMEM;
-					goto out;
-				}
-				goto repeat;
+				error = -ENOMEM;
+				goto failed;
 			}
-			wait_on_page_locked(page);
-			page_cache_release(page);
-			goto repeat;
 		}
 
 		/* We have to do this with page locked to prevent races */
-		if (!trylock_page(page)) {
-			spin_unlock(&info->lock);
-			wait_on_page_locked(page);
-			page_cache_release(page);
-			goto repeat;
-		}
-		if (PageWriteback(page)) {
-			spin_unlock(&info->lock);
-			wait_on_page_writeback(page);
-			unlock_page(page);
-			page_cache_release(page);
-			goto repeat;
-		}
+		lock_page(page);
 		if (!PageUptodate(page)) {
-			spin_unlock(&info->lock);
-			unlock_page(page);
-			page_cache_release(page);
 			error = -EIO;
-			goto out;
+			goto failed;
 		}
-
-		error = add_to_page_cache_locked(page, mapping,
-						index, GFP_NOWAIT);
-		if (error) {
-			spin_unlock(&info->lock);
-			if (error == -ENOMEM) {
-				/*
-				 * reclaim from proper memory cgroup and
-				 * call memcg's OOM if needed.
-				 */
-				error = mem_cgroup_shmem_charge_fallback(
-						page, current->mm, gfp);
-				if (error) {
-					unlock_page(page);
-					page_cache_release(page);
-					goto out;
-				}
-			}
-			unlock_page(page);
-			page_cache_release(page);
-			goto repeat;
+		wait_on_page_writeback(page);
+
+		/* Someone may have already done it for us */
+		if (page->mapping) {
+			if (page->mapping == mapping &&
+			    page->index == index)
+				goto done;
+			error = -EEXIST;
+			goto failed;
 		}
 
-		delete_from_swap_cache(page);
-		shmem_put_swap(info, index, (swp_entry_t){0});
+		error = shmem_add_to_page_cache(page, mapping, index,
+					gfp, swp_to_radix_entry(swap));
+		if (error)
+			goto failed;
+
+		spin_lock(&info->lock);
 		info->swapped--;
+		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
+
+		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		swap_free(swap);
 
-	} else if (sgp == SGP_READ) {
-		page = find_get_page(mapping, index);
-		if (page && !trylock_page(page)) {
-			spin_unlock(&info->lock);
-			wait_on_page_locked(page);
-			page_cache_release(page);
-			goto repeat;
+	} else {
+		if (shmem_acct_block(info->flags)) {
+			error = -ENOSPC;
+			goto failed;
 		}
-		spin_unlock(&info->lock);
-
-	} else if (prealloc_page) {
-		sbinfo = SHMEM_SB(inode->i_sb);
 		if (sbinfo->max_blocks) {
 			if (percpu_counter_compare(&sbinfo->used_blocks,
-						sbinfo->max_blocks) >= 0 ||
-			    shmem_acct_block(info->flags))
-				goto nospace;
+						sbinfo->max_blocks) >= 0) {
+				error = -ENOSPC;
+				goto unacct;
+			}
 			percpu_counter_inc(&sbinfo->used_blocks);
-			inode->i_blocks += BLOCKS_PER_PAGE;
-		} else if (shmem_acct_block(info->flags))
-			goto nospace;
-
-		page = prealloc_page;
-		prealloc_page = NULL;
+		}
 
-		swap = shmem_get_swap(info, index);
-		if (swap.val)
-			mem_cgroup_uncharge_cache_page(page);
-		else
-			error = add_to_page_cache_lru(page, mapping,
-						index, GFP_NOWAIT);
-		/*
-		 * At add_to_page_cache_lru() failure,
-		 * uncharge will be done automatically.
-		 */
-		if (swap.val || error) {
-			shmem_unacct_blocks(info->flags, 1);
-			shmem_free_blocks(inode, 1);
-			spin_unlock(&info->lock);
-			page_cache_release(page);
-			goto repeat;
+		page = shmem_alloc_page(gfp, info, index);
+		if (!page) {
+			error = -ENOMEM;
+			goto decused;
 		}
 
+		SetPageSwapBacked(page);
+		__set_page_locked(page);
+		error = shmem_add_to_page_cache(page, mapping, index,
+					gfp, NULL);
+		if (error)
+			goto decused;
+		lru_cache_add_anon(page);
+
+		spin_lock(&info->lock);
 		info->alloced++;
+		inode->i_blocks += BLOCKS_PER_PAGE;
+		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
+
 		clear_highpage(page);
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
-
-	} else {
-		spin_unlock(&info->lock);
-		error = -ENOMEM;
-		goto out;
 	}
 done:
-	*pagep = page;
-	error = 0;
-out:
-	if (prealloc_page) {
-		mem_cgroup_uncharge_cache_page(prealloc_page);
-		page_cache_release(prealloc_page);
+	/* Perhaps the file has been truncated since we checked */
+	if (sgp != SGP_WRITE &&
+	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+		error = -EINVAL;
+		goto trunc;
 	}
-	return error;
+	*pagep = page;
+	return 0;
 
-nospace:
 	/*
-	 * Perhaps the page was brought in from swap between find_lock_page
-	 * and taking info->lock? We allow for that at add_to_page_cache_lru,
-	 * but must also avoid reporting a spurious ENOSPC while working on a
-	 * full tmpfs.
+	 * Error recovery.
 	 */
-	page = find_get_page(mapping, index);
+trunc:
+	ClearPageDirty(page);
+	delete_from_page_cache(page);
+	spin_lock(&info->lock);
+	info->alloced--;
+	inode->i_blocks -= BLOCKS_PER_PAGE;
 	spin_unlock(&info->lock);
+decused:
+	if (sbinfo->max_blocks)
+		percpu_counter_add(&sbinfo->used_blocks, -1);
+unacct:
+	shmem_unacct_blocks(info->flags, 1);
+failed:
+	if (swap.val && error != -EINVAL) {
+		struct page *test = find_get_page(mapping, index);
+		if (test && !radix_tree_exceptional_entry(test))
+			page_cache_release(test);
+		/* Have another try if the entry has changed */
+		if (test != swp_to_radix_entry(swap))
+			error = -EEXIST;
+	}
 	if (page) {
+		unlock_page(page);
 		page_cache_release(page);
+	}
+	if (error == -ENOSPC && !once++) {
+		info = SHMEM_I(inode);
+		spin_lock(&info->lock);
+		shmem_recalc_inode(inode);
+		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	error = -ENOSPC;
-	goto out;
+	if (error == -EEXIST)
+		goto repeat;
+	return error;
 }
 
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -1095,9 +1064,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int error;
 	int ret = VM_FAULT_LOCKED;
 
-	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-		return VM_FAULT_SIGBUS;
-
 	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
 	if (error)
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -2164,8 +2130,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	if (config.max_inodes < inodes)
 		goto out;
 	/*
-	 * Those tests also disallow limited->unlimited while any are in
-	 * use, so i_blocks will always be zero when max_blocks is zero;
+	 * Those tests disallow limited->unlimited while any are in use;
 	 * but we must separately disallow unlimited->limited, because
 	 * in that case we have no record of how much is already in use.
 	 */