diff options
author | Hugh Dickins <hughd@google.com> | 2011-05-11 18:13:37 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-11 21:50:45 -0400 |
commit | 778dd893ae785c5fd505dac30b5fc40aae188bf1 (patch) | |
tree | e9ff9c1efa2105740b5be0c368bfbc89ee85a01b /mm | |
parent | b1dea800ac39599301d4bb8dcf2b1d29c2558211 (diff) |
tmpfs: fix race between umount and swapoff
The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable
to umount as that in shmem_writepage().
Fix this instance by extending the protection of shmem_swaplist_mutex
right across shmem_unuse_inode(): while it's on the list, the inode cannot
be evicted (and the filesystem cannot be unmounted) without
shmem_evict_inode() taking that mutex to remove it from the list.
But since shmem_writepage() might take that mutex, we should avoid making
memory allocations or memcg charges while holding it: prepare them at the
outer level in shmem_unuse(). When mem_cgroup_cache_charge() was
originally placed, we didn't know until that point that the page from swap
was actually a shmem page; but nowadays it's noted in the swap_map, so
we're safe to charge upfront. For the radix_tree, do as is done in
shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a
habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves
on occasion if subsequently preempted.
With the allocation and charge moved out from shmem_unuse_inode(),
we can also keep the index page mapped and info->lock held from finding the entry onwards.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/shmem.c | 88 |
1 file changed, 43 insertions, 45 deletions
diff --git a/mm/shmem.c b/mm/shmem.c index 262d71173447..dc17551d060a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -852,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_ | |||
852 | 852 | ||
853 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | 853 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) |
854 | { | 854 | { |
855 | struct inode *inode; | 855 | struct address_space *mapping; |
856 | unsigned long idx; | 856 | unsigned long idx; |
857 | unsigned long size; | 857 | unsigned long size; |
858 | unsigned long limit; | 858 | unsigned long limit; |
@@ -875,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
875 | if (size > SHMEM_NR_DIRECT) | 875 | if (size > SHMEM_NR_DIRECT) |
876 | size = SHMEM_NR_DIRECT; | 876 | size = SHMEM_NR_DIRECT; |
877 | offset = shmem_find_swp(entry, ptr, ptr+size); | 877 | offset = shmem_find_swp(entry, ptr, ptr+size); |
878 | if (offset >= 0) | 878 | if (offset >= 0) { |
879 | shmem_swp_balance_unmap(); | ||
879 | goto found; | 880 | goto found; |
881 | } | ||
880 | if (!info->i_indirect) | 882 | if (!info->i_indirect) |
881 | goto lost2; | 883 | goto lost2; |
882 | 884 | ||
@@ -914,11 +916,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
914 | if (size > ENTRIES_PER_PAGE) | 916 | if (size > ENTRIES_PER_PAGE) |
915 | size = ENTRIES_PER_PAGE; | 917 | size = ENTRIES_PER_PAGE; |
916 | offset = shmem_find_swp(entry, ptr, ptr+size); | 918 | offset = shmem_find_swp(entry, ptr, ptr+size); |
917 | shmem_swp_unmap(ptr); | ||
918 | if (offset >= 0) { | 919 | if (offset >= 0) { |
919 | shmem_dir_unmap(dir); | 920 | shmem_dir_unmap(dir); |
920 | goto found; | 921 | goto found; |
921 | } | 922 | } |
923 | shmem_swp_unmap(ptr); | ||
922 | } | 924 | } |
923 | } | 925 | } |
924 | lost1: | 926 | lost1: |
@@ -928,8 +930,7 @@ lost2: | |||
928 | return 0; | 930 | return 0; |
929 | found: | 931 | found: |
930 | idx += offset; | 932 | idx += offset; |
931 | inode = igrab(&info->vfs_inode); | 933 | ptr += offset; |
932 | spin_unlock(&info->lock); | ||
933 | 934 | ||
934 | /* | 935 | /* |
935 | * Move _head_ to start search for next from here. | 936 | * Move _head_ to start search for next from here. |
@@ -940,37 +941,18 @@ found: | |||
940 | */ | 941 | */ |
941 | if (shmem_swaplist.next != &info->swaplist) | 942 | if (shmem_swaplist.next != &info->swaplist) |
942 | list_move_tail(&shmem_swaplist, &info->swaplist); | 943 | list_move_tail(&shmem_swaplist, &info->swaplist); |
943 | mutex_unlock(&shmem_swaplist_mutex); | ||
944 | 944 | ||
945 | error = 1; | ||
946 | if (!inode) | ||
947 | goto out; | ||
948 | /* | 945 | /* |
949 | * Charge page using GFP_KERNEL while we can wait. | 946 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
950 | * Charged back to the user(not to caller) when swap account is used. | 947 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
951 | * add_to_page_cache() will be called with GFP_NOWAIT. | 948 | * beneath us (pagelock doesn't help until the page is in pagecache). |
952 | */ | 949 | */ |
953 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 950 | mapping = info->vfs_inode.i_mapping; |
954 | if (error) | 951 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); |
955 | goto out; | 952 | /* which does mem_cgroup_uncharge_cache_page on error */ |
956 | error = radix_tree_preload(GFP_KERNEL); | ||
957 | if (error) { | ||
958 | mem_cgroup_uncharge_cache_page(page); | ||
959 | goto out; | ||
960 | } | ||
961 | error = 1; | ||
962 | |||
963 | spin_lock(&info->lock); | ||
964 | ptr = shmem_swp_entry(info, idx, NULL); | ||
965 | if (ptr && ptr->val == entry.val) { | ||
966 | error = add_to_page_cache_locked(page, inode->i_mapping, | ||
967 | idx, GFP_NOWAIT); | ||
968 | /* does mem_cgroup_uncharge_cache_page on error */ | ||
969 | } else /* we must compensate for our precharge above */ | ||
970 | mem_cgroup_uncharge_cache_page(page); | ||
971 | 953 | ||
972 | if (error == -EEXIST) { | 954 | if (error == -EEXIST) { |
973 | struct page *filepage = find_get_page(inode->i_mapping, idx); | 955 | struct page *filepage = find_get_page(mapping, idx); |
974 | error = 1; | 956 | error = 1; |
975 | if (filepage) { | 957 | if (filepage) { |
976 | /* | 958 | /* |
@@ -990,14 +972,8 @@ found: | |||
990 | swap_free(entry); | 972 | swap_free(entry); |
991 | error = 1; /* not an error, but entry was found */ | 973 | error = 1; /* not an error, but entry was found */ |
992 | } | 974 | } |
993 | if (ptr) | 975 | shmem_swp_unmap(ptr); |
994 | shmem_swp_unmap(ptr); | ||
995 | spin_unlock(&info->lock); | 976 | spin_unlock(&info->lock); |
996 | radix_tree_preload_end(); | ||
997 | out: | ||
998 | unlock_page(page); | ||
999 | page_cache_release(page); | ||
1000 | iput(inode); /* allows for NULL */ | ||
1001 | return error; | 977 | return error; |
1002 | } | 978 | } |
1003 | 979 | ||
@@ -1009,6 +985,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1009 | struct list_head *p, *next; | 985 | struct list_head *p, *next; |
1010 | struct shmem_inode_info *info; | 986 | struct shmem_inode_info *info; |
1011 | int found = 0; | 987 | int found = 0; |
988 | int error; | ||
989 | |||
990 | /* | ||
991 | * Charge page using GFP_KERNEL while we can wait, before taking | ||
992 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | ||
993 | * Charged back to the user (not to caller) when swap account is used. | ||
994 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
995 | */ | ||
996 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | ||
997 | if (error) | ||
998 | goto out; | ||
999 | /* | ||
1000 | * Try to preload while we can wait, to not make a habit of | ||
1001 | * draining atomic reserves; but don't latch on to this cpu, | ||
1002 | * it's okay if sometimes we get rescheduled after this. | ||
1003 | */ | ||
1004 | error = radix_tree_preload(GFP_KERNEL); | ||
1005 | if (error) | ||
1006 | goto uncharge; | ||
1007 | radix_tree_preload_end(); | ||
1012 | 1008 | ||
1013 | mutex_lock(&shmem_swaplist_mutex); | 1009 | mutex_lock(&shmem_swaplist_mutex); |
1014 | list_for_each_safe(p, next, &shmem_swaplist) { | 1010 | list_for_each_safe(p, next, &shmem_swaplist) { |
@@ -1016,17 +1012,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1016 | found = shmem_unuse_inode(info, entry, page); | 1012 | found = shmem_unuse_inode(info, entry, page); |
1017 | cond_resched(); | 1013 | cond_resched(); |
1018 | if (found) | 1014 | if (found) |
1019 | goto out; | 1015 | break; |
1020 | } | 1016 | } |
1021 | mutex_unlock(&shmem_swaplist_mutex); | 1017 | mutex_unlock(&shmem_swaplist_mutex); |
1022 | /* | 1018 | |
1023 | * Can some race bring us here? We've been holding page lock, | 1019 | uncharge: |
1024 | * so I think not; but would rather try again later than BUG() | 1020 | if (!found) |
1025 | */ | 1021 | mem_cgroup_uncharge_cache_page(page); |
1022 | if (found < 0) | ||
1023 | error = found; | ||
1024 | out: | ||
1026 | unlock_page(page); | 1025 | unlock_page(page); |
1027 | page_cache_release(page); | 1026 | page_cache_release(page); |
1028 | out: | 1027 | return error; |
1029 | return (found < 0) ? found : 0; | ||
1030 | } | 1028 | } |
1031 | 1029 | ||
1032 | /* | 1030 | /* |