aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2011-05-11 18:13:36 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-11 21:50:45 -0400
commitb1dea800ac39599301d4bb8dcf2b1d29c2558211 (patch)
treec5a650a0d41936dac47aa42e554e1ebc3029e926 /mm
parent21a3c9646873ae0919415d635b671d6a58758ede (diff)
tmpfs: fix race between umount and writepage
Konstantin Khlebnikov reports that a dangerous race between umount and shmem_writepage can be reproduced by this script: for i in {1..300} ; do mkdir $i while true ; do mount -t tmpfs none $i dd if=/dev/zero of=$i/test bs=1M count=$(($RANDOM % 100)) umount $i done & done on a 6xCPU node with 8Gb RAM: kernel very unstable after this accident. =) Kernel log: VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day... WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() list_del corruption. prev->next should be ffff880222fdaac8, but was (null) Pid: 11222, comm: mount.tmpfs Not tainted 2.6.39-rc2+ #4 Call Trace: warn_slowpath_common+0x80/0x98 warn_slowpath_fmt+0x41/0x43 __list_del_entry+0x8d/0x98 evict+0x50/0x113 iput+0x138/0x141 ... BUG: unable to handle kernel paging request at ffffffffffffffff IP: shmem_free_blocks+0x18/0x4c Pid: 10422, comm: dd Tainted: G W 2.6.39-rc2+ #4 Call Trace: shmem_recalc_inode+0x61/0x66 shmem_writepage+0xba/0x1dc pageout+0x13c/0x24c shrink_page_list+0x28e/0x4be shrink_inactive_list+0x21f/0x382 ... shmem_writepage() calls igrab() on the inode for the page which came from page reclaim, to add it later into shmem_swaplist for swapoff operation. This igrab() can race with super-block deactivating process: shrink_inactive_list() deactivate_super() pageout() tmpfs_fs_type->kill_sb() shmem_writepage() kill_litter_super() generic_shutdown_super() evict_inodes() igrab() atomic_read(&inode->i_count) skip-inode iput() if (!list_empty(&sb->s_inodes)) printk("VFS: Busy inodes after... This igrab-iput pair was added in commit 1b1b32f2c6f6 "tmpfs: fix shmem_swaplist races" based on incorrect assumptions: igrab() protects the inode from concurrent eviction by deletion, but it does nothing to protect it from concurrent unmounting, which goes ahead despite the raised i_count. 
So this use of igrab() was wrong all along, but the race was made much worse in 2.6.37 when commit 63997e98a3be "split invalidate_inodes()" replaced two attempts at invalidate_inodes() by a single evict_inodes(). Konstantin posted a plausible patch, raising sb->s_active too: I'm unsure whether it was correct or not; but burnt once by igrab(), I am sure that we don't want to rely more deeply upon externals here. Fix it by adding the inode to shmem_swaplist earlier, while the page lock on page in page cache still secures the inode against eviction, without artificially raising i_count. It was originally added later because shmem_unuse_inode() is liable to remove an inode from the list while it's unswapped; but we can guard against that by taking spinlock before dropping mutex. Reported-by: Konstantin Khlebnikov <khlebnikov@openvz.org> Signed-off-by: Hugh Dickins <hughd@google.com> Tested-by: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/shmem.c31
1 files changed, 20 insertions, 11 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index 8fa27e4e582a..262d71173447 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1039,6 +1039,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1039 struct address_space *mapping; 1039 struct address_space *mapping;
1040 unsigned long index; 1040 unsigned long index;
1041 struct inode *inode; 1041 struct inode *inode;
1042 bool unlock_mutex = false;
1042 1043
1043 BUG_ON(!PageLocked(page)); 1044 BUG_ON(!PageLocked(page));
1044 mapping = page->mapping; 1045 mapping = page->mapping;
@@ -1064,7 +1065,26 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1064 else 1065 else
1065 swap.val = 0; 1066 swap.val = 0;
1066 1067
1068 /*
1069 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1070 * if it's not already there. Do it now because we cannot take
1071 * mutex while holding spinlock, and must do so before the page
1072 * is moved to swap cache, when its pagelock no longer protects
1073 * the inode from eviction. But don't unlock the mutex until
1074 * we've taken the spinlock, because shmem_unuse_inode() will
1075 * prune a !swapped inode from the swaplist under both locks.
1076 */
1077 if (swap.val && list_empty(&info->swaplist)) {
1078 mutex_lock(&shmem_swaplist_mutex);
1079 /* move instead of add in case we're racing */
1080 list_move_tail(&info->swaplist, &shmem_swaplist);
1081 unlock_mutex = true;
1082 }
1083
1067 spin_lock(&info->lock); 1084 spin_lock(&info->lock);
1085 if (unlock_mutex)
1086 mutex_unlock(&shmem_swaplist_mutex);
1087
1068 if (index >= info->next_index) { 1088 if (index >= info->next_index) {
1069 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1089 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1070 goto unlock; 1090 goto unlock;
@@ -1084,21 +1104,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1084 delete_from_page_cache(page); 1104 delete_from_page_cache(page);
1085 shmem_swp_set(info, entry, swap.val); 1105 shmem_swp_set(info, entry, swap.val);
1086 shmem_swp_unmap(entry); 1106 shmem_swp_unmap(entry);
1087 if (list_empty(&info->swaplist))
1088 inode = igrab(inode);
1089 else
1090 inode = NULL;
1091 spin_unlock(&info->lock); 1107 spin_unlock(&info->lock);
1092 swap_shmem_alloc(swap); 1108 swap_shmem_alloc(swap);
1093 BUG_ON(page_mapped(page)); 1109 BUG_ON(page_mapped(page));
1094 swap_writepage(page, wbc); 1110 swap_writepage(page, wbc);
1095 if (inode) {
1096 mutex_lock(&shmem_swaplist_mutex);
1097 /* move instead of add in case we're racing */
1098 list_move_tail(&info->swaplist, &shmem_swaplist);
1099 mutex_unlock(&shmem_swaplist_mutex);
1100 iput(inode);
1101 }
1102 return 0; 1111 return 0;
1103 } 1112 }
1104 1113