diff options
author | Hugh Dickins <hughd@google.com> | 2011-05-11 18:13:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-11 21:50:45 -0400 |
commit | b1dea800ac39599301d4bb8dcf2b1d29c2558211 (patch) | |
tree | c5a650a0d41936dac47aa42e554e1ebc3029e926 | |
parent | 21a3c9646873ae0919415d635b671d6a58758ede (diff) |
tmpfs: fix race between umount and writepage
Konstantin Khlebnikov reports that a dangerous race between umount and
shmem_writepage can be reproduced by this script:
for i in {1..300} ; do
mkdir $i
while true ; do
mount -t tmpfs none $i
dd if=/dev/zero of=$i/test bs=1M count=$(($RANDOM % 100))
umount $i
done &
done
on a 6xCPU node with 8Gb RAM: kernel very unstable after this accident. =)
Kernel log:
VFS: Busy inodes after unmount of tmpfs.
Self-destruct in 5 seconds. Have a nice day...
WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98()
list_del corruption. prev->next should be ffff880222fdaac8, but was (null)
Pid: 11222, comm: mount.tmpfs Not tainted 2.6.39-rc2+ #4
Call Trace:
warn_slowpath_common+0x80/0x98
warn_slowpath_fmt+0x41/0x43
__list_del_entry+0x8d/0x98
evict+0x50/0x113
iput+0x138/0x141
...
BUG: unable to handle kernel paging request at ffffffffffffffff
IP: shmem_free_blocks+0x18/0x4c
Pid: 10422, comm: dd Tainted: G W 2.6.39-rc2+ #4
Call Trace:
shmem_recalc_inode+0x61/0x66
shmem_writepage+0xba/0x1dc
pageout+0x13c/0x24c
shrink_page_list+0x28e/0x4be
shrink_inactive_list+0x21f/0x382
...
shmem_writepage() calls igrab() on the inode for the page which came from
page reclaim, to add it later into shmem_swaplist for swapoff operation.
This igrab() can race with super-block deactivating process:
shrink_inactive_list()          deactivate_super()
pageout()                       tmpfs_fs_type->kill_sb()
shmem_writepage()               kill_litter_super()
                                generic_shutdown_super()
                                 evict_inodes()
 igrab()
                                  atomic_read(&inode->i_count)
                                   skip-inode
 iput()
                                 if (!list_empty(&sb->s_inodes))
                                        printk("VFS: Busy inodes after...
This igrab-iput pair was added in commit 1b1b32f2c6f6 "tmpfs: fix
shmem_swaplist races" based on incorrect assumptions: igrab() protects the
inode from concurrent eviction by deletion, but it does nothing to protect
it from concurrent unmounting, which goes ahead despite the raised
i_count.
So this use of igrab() was wrong all along, but the race was made much worse
in 2.6.37 when commit 63997e98a3be "split invalidate_inodes()" replaced
two attempts at invalidate_inodes() by a single evict_inodes().
Konstantin posted a plausible patch, raising sb->s_active too: I'm unsure
whether it was correct or not; but burnt once by igrab(), I am sure that
we don't want to rely more deeply upon externals here.
Fix it by adding the inode to shmem_swaplist earlier, while the page lock
on page in page cache still secures the inode against eviction, without
artificially raising i_count. It was originally added later because
shmem_unuse_inode() is liable to remove an inode from the list while it's
unswapped; but we can guard against that by taking spinlock before
dropping mutex.
Reported-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Tested-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/shmem.c | 31 |
1 files changed, 20 insertions, 11 deletions
diff --git a/mm/shmem.c b/mm/shmem.c index 8fa27e4e582a..262d71173447 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1039,6 +1039,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1039 | struct address_space *mapping; | 1039 | struct address_space *mapping; |
1040 | unsigned long index; | 1040 | unsigned long index; |
1041 | struct inode *inode; | 1041 | struct inode *inode; |
1042 | bool unlock_mutex = false; | ||
1042 | 1043 | ||
1043 | BUG_ON(!PageLocked(page)); | 1044 | BUG_ON(!PageLocked(page)); |
1044 | mapping = page->mapping; | 1045 | mapping = page->mapping; |
@@ -1064,7 +1065,26 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1064 | else | 1065 | else |
1065 | swap.val = 0; | 1066 | swap.val = 0; |
1066 | 1067 | ||
1068 | /* | ||
1069 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | ||
1070 | * if it's not already there. Do it now because we cannot take | ||
1071 | * mutex while holding spinlock, and must do so before the page | ||
1072 | * is moved to swap cache, when its pagelock no longer protects | ||
1073 | * the inode from eviction. But don't unlock the mutex until | ||
1074 | * we've taken the spinlock, because shmem_unuse_inode() will | ||
1075 | * prune a !swapped inode from the swaplist under both locks. | ||
1076 | */ | ||
1077 | if (swap.val && list_empty(&info->swaplist)) { | ||
1078 | mutex_lock(&shmem_swaplist_mutex); | ||
1079 | /* move instead of add in case we're racing */ | ||
1080 | list_move_tail(&info->swaplist, &shmem_swaplist); | ||
1081 | unlock_mutex = true; | ||
1082 | } | ||
1083 | |||
1067 | spin_lock(&info->lock); | 1084 | spin_lock(&info->lock); |
1085 | if (unlock_mutex) | ||
1086 | mutex_unlock(&shmem_swaplist_mutex); | ||
1087 | |||
1068 | if (index >= info->next_index) { | 1088 | if (index >= info->next_index) { |
1069 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | 1089 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); |
1070 | goto unlock; | 1090 | goto unlock; |
@@ -1084,21 +1104,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1084 | delete_from_page_cache(page); | 1104 | delete_from_page_cache(page); |
1085 | shmem_swp_set(info, entry, swap.val); | 1105 | shmem_swp_set(info, entry, swap.val); |
1086 | shmem_swp_unmap(entry); | 1106 | shmem_swp_unmap(entry); |
1087 | if (list_empty(&info->swaplist)) | ||
1088 | inode = igrab(inode); | ||
1089 | else | ||
1090 | inode = NULL; | ||
1091 | spin_unlock(&info->lock); | 1107 | spin_unlock(&info->lock); |
1092 | swap_shmem_alloc(swap); | 1108 | swap_shmem_alloc(swap); |
1093 | BUG_ON(page_mapped(page)); | 1109 | BUG_ON(page_mapped(page)); |
1094 | swap_writepage(page, wbc); | 1110 | swap_writepage(page, wbc); |
1095 | if (inode) { | ||
1096 | mutex_lock(&shmem_swaplist_mutex); | ||
1097 | /* move instead of add in case we're racing */ | ||
1098 | list_move_tail(&info->swaplist, &shmem_swaplist); | ||
1099 | mutex_unlock(&shmem_swaplist_mutex); | ||
1100 | iput(inode); | ||
1101 | } | ||
1102 | return 0; | 1111 | return 0; |
1103 | } | 1112 | } |
1104 | 1113 | ||