aboutsummaryrefslogtreecommitdiffstats
path: root/fs/notify
diff options
context:
space:
mode:
authorLino Sanfilippo <LinoSanfilippo@gmx.de>2011-08-11 19:13:31 -0400
committerEric Paris <eparis@redhat.com>2012-12-11 13:44:36 -0500
commit6960b0d909cde5bdff49e4e5c1250edd10be7ebd (patch)
tree722f06d4c8a815606f5b4383b909cd8c55087c2f /fs/notify
parent64c20d2a20fce295c260ea6cb3b468edfa2fb07b (diff)
fsnotify: change locking order
On Mon, Aug 01, 2011 at 04:38:22PM -0400, Eric Paris wrote: > > I finally built and tested a v3.0 kernel with these patches (I know I'm > SOOOOOO far behind). Not what I hoped for: > > > [ 150.937798] VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day... > > [ 150.945290] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070 > > [ 150.946012] IP: [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50 > > [ 150.946012] PGD 2bf9e067 PUD 2bf9f067 PMD 0 > > [ 150.946012] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC > > [ 150.946012] CPU 0 > > [ 150.946012] Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ext4 jbd2 crc16 joydev ata_piix i2c_piix4 pcspkr uinput ipv6 autofs4 usbhid [last unloaded: scsi_wait_scan] > > [ 150.946012] > > [ 150.946012] Pid: 2764, comm: syscall_thrash Not tainted 3.0.0+ #1 Red Hat KVM > > [ 150.946012] RIP: 0010:[<ffffffff810ffd58>] [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50 > > [ 150.946012] RSP: 0018:ffff88002c2e5df8 EFLAGS: 00010282 > > [ 150.946012] RAX: 000000004e370d9f RBX: 0000000000000000 RCX: ffff88003a029438 > > [ 150.946012] RDX: 0000000033630a5f RSI: 0000000000000000 RDI: ffff88003491c240 > > [ 150.946012] RBP: ffff88002c2e5e08 R08: 0000000000000000 R09: 0000000000000000 > > [ 150.946012] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a029428 > > [ 150.946012] R13: ffff88003a029428 R14: ffff88003a029428 R15: ffff88003499a610 > > [ 150.946012] FS: 00007f5a05420700(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000 > > [ 150.946012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > > [ 150.946012] CR2: 0000000000000070 CR3: 000000002a662000 CR4: 00000000000006f0 > > [ 150.946012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > > [ 150.946012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > > [ 150.946012] Process syscall_thrash (pid: 2764, threadinfo ffff88002c2e4000, task ffff88002bfbc760) > > [ 150.946012] Stack: > > [ 150.946012] ffff88003a029438 ffff88003a029428 ffff88002c2e5e38 ffffffff81102f76 > > [ 150.946012] ffff88003a029438 ffff88003a029598 ffffffff8160f9c0 ffff88002c221250 > > [ 150.946012] ffff88002c2e5e68 ffffffff8115e9be ffff88002c2e5e68 ffff88003a029438 > > [ 150.946012] Call Trace: > > [ 150.946012] [<ffffffff81102f76>] shmem_evict_inode+0x76/0x130 > > [ 150.946012] [<ffffffff8115e9be>] evict+0x7e/0x170 > > [ 150.946012] [<ffffffff8115ee40>] iput_final+0xd0/0x190 > > [ 150.946012] [<ffffffff8115ef33>] iput+0x33/0x40 > > [ 150.946012] [<ffffffff81180205>] fsnotify_destroy_mark_locked+0x145/0x160 > > [ 150.946012] [<ffffffff81180316>] fsnotify_destroy_mark+0x36/0x50 > > [ 150.946012] [<ffffffff81181937>] sys_inotify_rm_watch+0x77/0xd0 > > [ 150.946012] [<ffffffff815aca52>] system_call_fastpath+0x16/0x1b > > [ 150.946012] Code: 67 4a 00 b8 e4 ff ff ff eb aa 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 8b 9f 40 05 00 00 > > [ 150.946012] 83 7b 70 00 74 1c 4c 8d a3 80 00 00 00 4c 89 e7 e8 d2 5d 4a > > [ 150.946012] RIP [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50 > > [ 150.946012] RSP <ffff88002c2e5df8> > > [ 150.946012] CR2: 0000000000000070 > > Looks at aweful lot like the problem from: > http://www.spinics.net/lists/linux-fsdevel/msg46101.html > I tried to reproduce this bug with your test program, but without success. However, if I understand correctly, this occurs since we dont hold any locks when we call iput() in mark_destroy(), right? With the patches you tested, iput() is also not called within any lock, since the groups mark_mutex is released temporarily before iput() is called. This is, since the original codes behaviour is similar. However since we now have a mutex as the biggest lock, we can do what you suggested (http://www.spinics.net/lists/linux-fsdevel/msg46107.html) and call iput() with the mutex held to avoid the race. The patch below implements this. It uses nested locking to avoid deadlock in case we do the final iput() on an inode which still holds marks and thus would take the mutex again when calling fsnotify_inode_delete() in destroy_inode(). Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de> Signed-off-by: Eric Paris <eparis@redhat.com>
Diffstat (limited to 'fs/notify')
-rw-r--r--fs/notify/mark.c20
1 files changed, 10 insertions, 10 deletions
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 0e93d90bb753..fc6b49bf7360 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -150,6 +150,8 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
150 150
151 spin_unlock(&mark->lock); 151 spin_unlock(&mark->lock);
152 152
153 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
154 iput(inode);
153 /* release lock temporarily */ 155 /* release lock temporarily */
154 mutex_unlock(&group->mark_mutex); 156 mutex_unlock(&group->mark_mutex);
155 157
@@ -157,6 +159,11 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
157 list_add(&mark->destroy_list, &destroy_list); 159 list_add(&mark->destroy_list, &destroy_list);
158 spin_unlock(&destroy_lock); 160 spin_unlock(&destroy_lock);
159 wake_up(&destroy_waitq); 161 wake_up(&destroy_waitq);
162 /*
163 * We don't necessarily have a ref on mark from caller so the above destroy
164 * may have actually freed it, unless this group provides a 'freeing_mark'
165 * function which must be holding a reference.
166 */
160 167
161 /* 168 /*
162 * Some groups like to know that marks are being freed. This is a 169 * Some groups like to know that marks are being freed. This is a
@@ -178,22 +185,15 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
178 * is just a lazy update (and could be a perf win...) 185 * is just a lazy update (and could be a perf win...)
179 */ 186 */
180 187
181 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
182 iput(inode);
183 /*
184 * We don't necessarily have a ref on mark from caller so the above iput
185 * may have already destroyed it. Don't touch from now on.
186 */
187
188 atomic_dec(&group->num_marks); 188 atomic_dec(&group->num_marks);
189 189
190 mutex_lock(&group->mark_mutex); 190 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
191} 191}
192 192
193void fsnotify_destroy_mark(struct fsnotify_mark *mark, 193void fsnotify_destroy_mark(struct fsnotify_mark *mark,
194 struct fsnotify_group *group) 194 struct fsnotify_group *group)
195{ 195{
196 mutex_lock(&group->mark_mutex); 196 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
197 fsnotify_destroy_mark_locked(mark, group); 197 fsnotify_destroy_mark_locked(mark, group);
198 mutex_unlock(&group->mark_mutex); 198 mutex_unlock(&group->mark_mutex);
199} 199}
@@ -300,7 +300,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
300{ 300{
301 struct fsnotify_mark *lmark, *mark; 301 struct fsnotify_mark *lmark, *mark;
302 302
303 mutex_lock(&group->mark_mutex); 303 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
304 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { 304 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
305 if (mark->flags & flags) { 305 if (mark->flags & flags) {
306 fsnotify_get_mark(mark); 306 fsnotify_get_mark(mark);