diff options
author | Tejun Heo <tj@kernel.org> | 2014-03-04 15:38:46 -0500 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2014-03-09 01:08:29 -0500 |
commit | b7ce40cff0b9f6597f8318fd761accd92727f61f (patch) | |
tree | 03f743474c481d9c9c5f931675dba0d79a546ba8 | |
parent | 92d585ef067da7a966d6ce78c601bd1562b62619 (diff) |
kernfs: cache atomic_write_len in kernfs_open_file
While implementing atomic_write_len, 4d3773c4bb41 ("kernfs: implement
kernfs_ops->atomic_write_len") moved data copy from userland inside
kernfs_get_active() and kernfs_open_file->mutex so that
kernfs_ops->atomic_write_len can be accessed before copying buffer
from userland; unfortunately, this could lead to locking order
inversion involving mmap_sem if copy_from_user() takes a page fault.
======================================================
[ INFO: possible circular locking dependency detected ]
3.14.0-rc4-next-20140228-sasha-00011-g4077c67-dirty #26 Tainted: G W
-------------------------------------------------------
trinity-c236/10658 is trying to acquire lock:
(&of->mutex#2){+.+.+.}, at: [<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
but task is already holding lock:
(&mm->mmap_sem){++++++}, at: [<mm/util.c:397>] vm_mmap_pgoff+0x6e/0xe0
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&mm->mmap_sem){++++++}:
[<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
[<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
[<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
[<mm/memory.c:4188>] might_fault+0x7e/0xb0
[<arch/x86/include/asm/uaccess.h:713 fs/kernfs/file.c:291>] kernfs_fop_write+0xd8/0x190
[<fs/read_write.c:473>] vfs_write+0xe3/0x1d0
[<fs/read_write.c:523 fs/read_write.c:515>] SyS_write+0x5d/0xa0
[<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2
-> #0 (&of->mutex#2){+.+.+.}:
[<kernel/locking/lockdep.c:1840>] check_prev_add+0x13f/0x560
[<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
[<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
[<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
[<kernel/locking/mutex.c:470 kernel/locking/mutex.c:571>] mutex_lock_nested+0x6a/0x510
[<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
[<mm/mmap.c:1573>] mmap_region+0x310/0x5c0
[<mm/mmap.c:1365>] do_mmap_pgoff+0x385/0x430
[<mm/util.c:399>] vm_mmap_pgoff+0x8f/0xe0
[<mm/mmap.c:1416 mm/mmap.c:1374>] SyS_mmap_pgoff+0x1b0/0x210
[<arch/x86/kernel/sys_x86_64.c:72>] SyS_mmap+0x1d/0x20
[<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2
other info that might help us debug this:
Possible unsafe locking scenario:
       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(&of->mutex#2);
                               lock(&mm->mmap_sem);
  lock(&of->mutex#2);
*** DEADLOCK ***
1 lock held by trinity-c236/10658:
#0: (&mm->mmap_sem){++++++}, at: [<mm/util.c:397>] vm_mmap_pgoff+0x6e/0xe0
stack backtrace:
CPU: 2 PID: 10658 Comm: trinity-c236 Tainted: G W 3.14.0-rc4-next-20140228-sasha-00011-g4077c67-dirty #26
0000000000000000 ffff88011911fa48 ffffffff8438e945 0000000000000000
0000000000000000 ffff88011911fa98 ffffffff811a0109 ffff88011911fab8
ffff88011911fab8 ffff88011911fa98 ffff880119128cc0 ffff880119128cf8
Call Trace:
[<lib/dump_stack.c:52>] dump_stack+0x52/0x7f
[<kernel/locking/lockdep.c:1213>] print_circular_bug+0x129/0x160
[<kernel/locking/lockdep.c:1840>] check_prev_add+0x13f/0x560
[<include/linux/spinlock.h:343 mm/slub.c:1933>] ? deactivate_slab+0x511/0x550
[<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
[<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
[<mm/mmap.c:1552>] ? mmap_region+0x24a/0x5c0
[<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
[<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
[<kernel/locking/mutex.c:470 kernel/locking/mutex.c:571>] mutex_lock_nested+0x6a/0x510
[<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
[<kernel/sched/core.c:2477>] ? get_parent_ip+0x11/0x50
[<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
[<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
[<mm/mmap.c:1573>] mmap_region+0x310/0x5c0
[<mm/mmap.c:1365>] do_mmap_pgoff+0x385/0x430
[<mm/util.c:397>] ? vm_mmap_pgoff+0x6e/0xe0
[<mm/util.c:399>] vm_mmap_pgoff+0x8f/0xe0
[<kernel/rcu/update.c:97>] ? __rcu_read_unlock+0x44/0xb0
[<fs/file.c:641>] ? dup_fd+0x3c0/0x3c0
[<mm/mmap.c:1416 mm/mmap.c:1374>] SyS_mmap_pgoff+0x1b0/0x210
[<arch/x86/kernel/sys_x86_64.c:72>] SyS_mmap+0x1d/0x20
[<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2
Fix it by caching atomic_write_len in kernfs_open_file during open so
that it can be determined without accessing kernfs_ops in
kernfs_fop_write(). This restores the structure of kernfs_fop_write()
before 4d3773c4bb41 with updated @len determination logic.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
References: http://lkml.kernel.org/g/53113485.2090407@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | fs/kernfs/file.c | 63 | ||||
-rw-r--r-- | include/linux/kernfs.h | 1 |
2 files changed, 33 insertions, 31 deletions
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ddcb471b9cc9..8034706a7af8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c | |||
@@ -253,55 +253,50 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, | |||
253 | { | 253 | { |
254 | struct kernfs_open_file *of = kernfs_of(file); | 254 | struct kernfs_open_file *of = kernfs_of(file); |
255 | const struct kernfs_ops *ops; | 255 | const struct kernfs_ops *ops; |
256 | char *buf = NULL; | 256 | size_t len; |
257 | ssize_t len; | 257 | char *buf; |
258 | |||
259 | /* | ||
260 | * @of->mutex nests outside active ref and is just to ensure that | ||
261 | * the ops aren't called concurrently for the same open file. | ||
262 | */ | ||
263 | mutex_lock(&of->mutex); | ||
264 | if (!kernfs_get_active(of->kn)) { | ||
265 | mutex_unlock(&of->mutex); | ||
266 | return -ENODEV; | ||
267 | } | ||
268 | |||
269 | ops = kernfs_ops(of->kn); | ||
270 | if (!ops->write) { | ||
271 | len = -EINVAL; | ||
272 | goto out_unlock; | ||
273 | } | ||
274 | 258 | ||
275 | if (ops->atomic_write_len) { | 259 | if (of->atomic_write_len) { |
276 | len = count; | 260 | len = count; |
277 | if (len > ops->atomic_write_len) { | 261 | if (len > of->atomic_write_len) |
278 | len = -E2BIG; | 262 | return -E2BIG; |
279 | goto out_unlock; | ||
280 | } | ||
281 | } else { | 263 | } else { |
282 | len = min_t(size_t, count, PAGE_SIZE); | 264 | len = min_t(size_t, count, PAGE_SIZE); |
283 | } | 265 | } |
284 | 266 | ||
285 | buf = kmalloc(len + 1, GFP_KERNEL); | 267 | buf = kmalloc(len + 1, GFP_KERNEL); |
286 | if (!buf) { | 268 | if (!buf) |
287 | len = -ENOMEM; | 269 | return -ENOMEM; |
288 | goto out_unlock; | ||
289 | } | ||
290 | 270 | ||
291 | if (copy_from_user(buf, user_buf, len)) { | 271 | if (copy_from_user(buf, user_buf, len)) { |
292 | len = -EFAULT; | 272 | len = -EFAULT; |
293 | goto out_unlock; | 273 | goto out_free; |
294 | } | 274 | } |
295 | buf[len] = '\0'; /* guarantee string termination */ | 275 | buf[len] = '\0'; /* guarantee string termination */ |
296 | 276 | ||
297 | len = ops->write(of, buf, len, *ppos); | 277 | /* |
298 | out_unlock: | 278 | * @of->mutex nests outside active ref and is just to ensure that |
279 | * the ops aren't called concurrently for the same open file. | ||
280 | */ | ||
281 | mutex_lock(&of->mutex); | ||
282 | if (!kernfs_get_active(of->kn)) { | ||
283 | mutex_unlock(&of->mutex); | ||
284 | len = -ENODEV; | ||
285 | goto out_free; | ||
286 | } | ||
287 | |||
288 | ops = kernfs_ops(of->kn); | ||
289 | if (ops->write) | ||
290 | len = ops->write(of, buf, len, *ppos); | ||
291 | else | ||
292 | len = -EINVAL; | ||
293 | |||
299 | kernfs_put_active(of->kn); | 294 | kernfs_put_active(of->kn); |
300 | mutex_unlock(&of->mutex); | 295 | mutex_unlock(&of->mutex); |
301 | 296 | ||
302 | if (len > 0) | 297 | if (len > 0) |
303 | *ppos += len; | 298 | *ppos += len; |
304 | 299 | out_free: | |
305 | kfree(buf); | 300 | kfree(buf); |
306 | return len; | 301 | return len; |
307 | } | 302 | } |
@@ -666,6 +661,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) | |||
666 | of->file = file; | 661 | of->file = file; |
667 | 662 | ||
668 | /* | 663 | /* |
664 | * Write path needs to access atomic_write_len outside active reference.
665 | * Cache it in open_file. See kernfs_fop_write() for details. | ||
666 | */ | ||
667 | of->atomic_write_len = ops->atomic_write_len; | ||
668 | |||
669 | /* | ||
669 | * Always instantiate seq_file even if read access doesn't use | 670 | * Always instantiate seq_file even if read access doesn't use |
670 | * seq_file or is not requested. This unifies private data access | 671 | * seq_file or is not requested. This unifies private data access |
671 | * and readable regular files are the vast majority anyway. | 672 | * and readable regular files are the vast majority anyway. |
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 09669d092748..b0122dc6f96a 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h | |||
@@ -158,6 +158,7 @@ struct kernfs_open_file { | |||
158 | int event; | 158 | int event; |
159 | struct list_head list; | 159 | struct list_head list; |
160 | 160 | ||
161 | size_t atomic_write_len; | ||
161 | bool mmapped; | 162 | bool mmapped; |
162 | const struct vm_operations_struct *vm_ops; | 163 | const struct vm_operations_struct *vm_ops; |
163 | }; | 164 | }; |