author     Hugh Dickins <hughd@google.com>                2019-04-18 20:50:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org> 2019-04-19 12:46:04 -0400
commit     af53d3e9e04024885de5b4fda51e5fa362ae2bd8 (patch)
tree       ab2a56b0bafc76d99dcd2924f22f0fed76f4ae2e /mm/shmem.c
parent     64165b1affc5bc16231ac971e66aae7d68d57f2c (diff)
mm: swapoff: shmem_unuse() stop eviction without igrab()
The igrab() in shmem_unuse() looks good, but we forgot that it gives no
protection against concurrent unmounting: a point made by Konstantin
Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78
("tmpfs: fix race between umount and swapoff").  The current 5.1-rc
swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs.
Self-destruct in 5 seconds.  Have a nice day..." followed by a GPF.

Once again, give up on using igrab(); but don't go back to making such
heavy-handed use of shmem_swaplist_mutex as last time: that would spoil
the new design, and I expect could deadlock inside shmem_swapin_page().

Instead, have shmem_unuse() just raise a "stop_eviction" count in the
shmem-specific inode, and have shmem_evict_inode() wait for that to go
down to 0.  Call it "stop_eviction" rather than "swapoff_busy" because
it can be put to other uses later (huge tmpfs patches expect to use it).

That simplifies shmem_unuse(), protecting it from both unlink and
unmount; and in practice lets it locate all the swap in its first try.
But do not rely on that: there's still a theoretical case, when
shmem_writepage() might have been preempted after its get_swap_page(),
before making the swap entry visible to swapoff.

[hughd@google.com: remove incorrect list_del()]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils
Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Alex Xu (Hello71)" <alex_y_xu@yahoo.ca>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kelley Nielsen <kelleynnn@gmail.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vineeth Pillai <vpillai@digitalocean.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
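The synchronization described above is the kernel's wait_var_event()/wake_up_var()
pairing around an atomic count.  As a rough illustration only (foo_info, foo_scan(),
foo_evict() and foo_swaplist_mutex are made-up names; this is a simplified sketch of
the pattern, not the shmem code, which follows in the diff below):

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/wait_bit.h>

struct foo_info {
        atomic_t                stop_eviction;  /* raised while a scanner holds the inode */
        struct list_head        swaplist;
};

static DEFINE_MUTEX(foo_swaplist_mutex);

/* Scanner (swapoff) side: pin the inode against eviction, then drop the list mutex. */
static void foo_scan(struct foo_info *info)
{
        mutex_lock(&foo_swaplist_mutex);
        atomic_inc(&info->stop_eviction);
        mutex_unlock(&foo_swaplist_mutex);

        /* ... long, possibly sleeping search of the inode for swap entries ... */

        if (atomic_dec_and_test(&info->stop_eviction))
                wake_up_var(&info->stop_eviction);
}

/* Eviction side: wait until no scanner holds stop_eviction before unlinking. */
static void foo_evict(struct foo_info *info)
{
        while (!list_empty(&info->swaplist)) {
                wait_var_event(&info->stop_eviction,
                               !atomic_read(&info->stop_eviction));
                mutex_lock(&foo_swaplist_mutex);
                /* Re-check under the mutex in case the count was raised again. */
                if (!atomic_read(&info->stop_eviction))
                        list_del_init(&info->swaplist);
                mutex_unlock(&foo_swaplist_mutex);
        }
}

The re-check under the mutex mirrors the "peeked too early" comment in the patch:
wait_var_event() samples the count without the mutex held, so eviction must confirm
it is still zero before removing the inode from the list.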
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--   mm/shmem.c   40
1 file changed, 18 insertions(+), 22 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 859e8628071f..2275a0ff7c30 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode)
                         }
                         spin_unlock(&sbinfo->shrinklist_lock);
                 }
-                if (!list_empty(&info->swaplist)) {
+                while (!list_empty(&info->swaplist)) {
+                        /* Wait while shmem_unuse() is scanning this inode... */
+                        wait_var_event(&info->stop_eviction,
+                                       !atomic_read(&info->stop_eviction));
                         mutex_lock(&shmem_swaplist_mutex);
-                        list_del_init(&info->swaplist);
+                        /* ...but beware of the race if we peeked too early */
+                        if (!atomic_read(&info->stop_eviction))
+                                list_del_init(&info->swaplist);
                         mutex_unlock(&shmem_swaplist_mutex);
                 }
         }
@@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap,
                 unsigned long *fs_pages_to_unuse)
 {
         struct shmem_inode_info *info, *next;
-        struct inode *inode;
-        struct inode *prev_inode = NULL;
         int error = 0;
 
         if (list_empty(&shmem_swaplist))
                 return 0;
 
         mutex_lock(&shmem_swaplist_mutex);
-
-        /*
-         * The extra refcount on the inode is necessary to safely dereference
-         * p->next after re-acquiring the lock. New shmem inodes with swap
-         * get added to the end of the list and we will scan them all.
-         */
         list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
                 if (!info->swapped) {
                         list_del_init(&info->swaplist);
                         continue;
                 }
-
-                inode = igrab(&info->vfs_inode);
-                if (!inode)
-                        continue;
-
+                /*
+                 * Drop the swaplist mutex while searching the inode for swap;
+                 * but before doing so, make sure shmem_evict_inode() will not
+                 * remove placeholder inode from swaplist, nor let it be freed
+                 * (igrab() would protect from unlink, but not from unmount).
+                 */
+                atomic_inc(&info->stop_eviction);
                 mutex_unlock(&shmem_swaplist_mutex);
-                if (prev_inode)
-                        iput(prev_inode);
-                prev_inode = inode;
 
-                error = shmem_unuse_inode(inode, type, frontswap,
+                error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
                                           fs_pages_to_unuse);
                 cond_resched();
 
@@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap,
                 next = list_next_entry(info, swaplist);
                 if (!info->swapped)
                         list_del_init(&info->swaplist);
+                if (atomic_dec_and_test(&info->stop_eviction))
+                        wake_up_var(&info->stop_eviction);
                 if (error)
                         break;
         }
         mutex_unlock(&shmem_swaplist_mutex);
 
-        if (prev_inode)
-                iput(prev_inode);
-
         return error;
 }
 
@@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
                 info = SHMEM_I(inode);
                 memset(info, 0, (char *)inode - (char *)info);
                 spin_lock_init(&info->lock);
+                atomic_set(&info->stop_eviction, 0);
                 info->seals = F_SEAL_SEAL;
                 info->flags = flags & VM_NORESERVE;
                 INIT_LIST_HEAD(&info->shrinklist);