aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-03-03 12:36:58 -0500
committerAl Viro <viro@zeniv.linux.org.uk>2014-03-10 11:44:41 -0400
commit9c225f2655e36a470c4f58dbbc99244c5fc7f2d4 (patch)
tree7cb89dbc82ee1b533ff2d097fed6a4248374bd4b
parent1b56e98990bcdbb20b9fab163654b9315bf158e8 (diff)
vfs: atomic f_pos accesses as per POSIX
Our write() system call has always been atomic in the sense that you get the expected thread-safe contiguous write, but we haven't actually guaranteed that concurrent writes are serialized wrt f_pos accesses, so threads (or processes) that share a file descriptor and use "write()" concurrently would quite likely overwrite each others data. This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says: "2.9.7 Thread Interactions with Regular File Operations All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: [...]" and one of the effects is the file position update. This unprotected file position behavior is not new behavior, and nobody has ever cared. Until now. Yongzhi Pan reported unexpected behavior to Michael Kerrisk that was due to this. This resolves the issue with a f_pos-specific lock that is taken by read/write/lseek on file descriptors that may be shared across threads or processes. Reported-by: Yongzhi Pan <panyongzhi@gmail.com> Reported-by: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--fs/file_table.c1
-rw-r--r--fs/namei.c2
-rw-r--r--fs/open.c4
-rw-r--r--fs/read_write.c54
-rw-r--r--include/linux/file.h6
-rw-r--r--include/linux/fs.h6
6 files changed, 55 insertions, 18 deletions
diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
135 atomic_long_set(&f->f_count, 1); 135 atomic_long_set(&f->f_count, 1);
136 rwlock_init(&f->f_owner.lock); 136 rwlock_init(&f->f_owner.lock);
137 spin_lock_init(&f->f_lock); 137 spin_lock_init(&f->f_lock);
138 mutex_init(&f->f_pos_lock);
138 eventpoll_init_file(f); 139 eventpoll_init_file(f);
139 /* f->f_version: 0 */ 140 /* f->f_version: 0 */
140 return f; 141 return f;
diff --git a/fs/namei.c b/fs/namei.c
index 385f7817bfcc..2f730ef9b4b3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1884 1884
1885 nd->path = f.file->f_path; 1885 nd->path = f.file->f_path;
1886 if (flags & LOOKUP_RCU) { 1886 if (flags & LOOKUP_RCU) {
1887 if (f.need_put) 1887 if (f.flags & FDPUT_FPUT)
1888 *fp = f.file; 1888 *fp = f.file;
1889 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1889 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1890 rcu_read_lock(); 1890 rcu_read_lock();
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
705 return 0; 705 return 0;
706 } 706 }
707 707
708 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
709 if (S_ISREG(inode->i_mode))
710 f->f_mode |= FMODE_ATOMIC_POS;
711
708 f->f_op = fops_get(inode->i_fop); 712 f->f_op = fops_get(inode->i_fop);
709 if (unlikely(WARN_ON(!f->f_op))) { 713 if (unlikely(WARN_ON(!f->f_op))) {
710 error = -ENODEV; 714 error = -ENODEV;
diff --git a/fs/read_write.c b/fs/read_write.c
index edc5746a902a..932bb3414a96 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
264} 264}
265EXPORT_SYMBOL(vfs_llseek); 265EXPORT_SYMBOL(vfs_llseek);
266 266
267/*
268 * We only lock f_pos if we have threads or if the file might be
269 * shared with another process. In both cases we'll have an elevated
270 * file count (done either by fdget() or by fork()).
271 */
272static inline struct fd fdget_pos(int fd)
273{
274 struct fd f = fdget(fd);
275 struct file *file = f.file;
276
277 if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
278 if (file_count(file) > 1) {
279 f.flags |= FDPUT_POS_UNLOCK;
280 mutex_lock(&file->f_pos_lock);
281 }
282 }
283 return f;
284}
285
286static inline void fdput_pos(struct fd f)
287{
288 if (f.flags & FDPUT_POS_UNLOCK)
289 mutex_unlock(&f.file->f_pos_lock);
290 fdput(f);
291}
292
267SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) 293SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
268{ 294{
269 off_t retval; 295 off_t retval;
270 struct fd f = fdget(fd); 296 struct fd f = fdget_pos(fd);
271 if (!f.file) 297 if (!f.file)
272 return -EBADF; 298 return -EBADF;
273 299
@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
278 if (res != (loff_t)retval) 304 if (res != (loff_t)retval)
279 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 305 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
280 } 306 }
281 fdput(f); 307 fdput_pos(f);
282 return retval; 308 return retval;
283} 309}
284 310
@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
498 524
499SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 525SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
500{ 526{
501 struct fd f = fdget(fd); 527 struct fd f = fdget_pos(fd);
502 ssize_t ret = -EBADF; 528 ssize_t ret = -EBADF;
503 529
504 if (f.file) { 530 if (f.file) {
@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
506 ret = vfs_read(f.file, buf, count, &pos); 532 ret = vfs_read(f.file, buf, count, &pos);
507 if (ret >= 0) 533 if (ret >= 0)
508 file_pos_write(f.file, pos); 534 file_pos_write(f.file, pos);
509 fdput(f); 535 fdput_pos(f);
510 } 536 }
511 return ret; 537 return ret;
512} 538}
@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
514SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 540SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
515 size_t, count) 541 size_t, count)
516{ 542{
517 struct fd f = fdget(fd); 543 struct fd f = fdget_pos(fd);
518 ssize_t ret = -EBADF; 544 ssize_t ret = -EBADF;
519 545
520 if (f.file) { 546 if (f.file) {
@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
522 ret = vfs_write(f.file, buf, count, &pos); 548 ret = vfs_write(f.file, buf, count, &pos);
523 if (ret >= 0) 549 if (ret >= 0)
524 file_pos_write(f.file, pos); 550 file_pos_write(f.file, pos);
525 fdput(f); 551 fdput_pos(f);
526 } 552 }
527 553
528 return ret; 554 return ret;
@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
797SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 823SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
798 unsigned long, vlen) 824 unsigned long, vlen)
799{ 825{
800 struct fd f = fdget(fd); 826 struct fd f = fdget_pos(fd);
801 ssize_t ret = -EBADF; 827 ssize_t ret = -EBADF;
802 828
803 if (f.file) { 829 if (f.file) {
@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
805 ret = vfs_readv(f.file, vec, vlen, &pos); 831 ret = vfs_readv(f.file, vec, vlen, &pos);
806 if (ret >= 0) 832 if (ret >= 0)
807 file_pos_write(f.file, pos); 833 file_pos_write(f.file, pos);
808 fdput(f); 834 fdput_pos(f);
809 } 835 }
810 836
811 if (ret > 0) 837 if (ret > 0)
@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
817SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 843SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
818 unsigned long, vlen) 844 unsigned long, vlen)
819{ 845{
820 struct fd f = fdget(fd); 846 struct fd f = fdget_pos(fd);
821 ssize_t ret = -EBADF; 847 ssize_t ret = -EBADF;
822 848
823 if (f.file) { 849 if (f.file) {
@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
825 ret = vfs_writev(f.file, vec, vlen, &pos); 851 ret = vfs_writev(f.file, vec, vlen, &pos);
826 if (ret >= 0) 852 if (ret >= 0)
827 file_pos_write(f.file, pos); 853 file_pos_write(f.file, pos);
828 fdput(f); 854 fdput_pos(f);
829 } 855 }
830 856
831 if (ret > 0) 857 if (ret > 0)
@@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
968 const struct compat_iovec __user *,vec, 994 const struct compat_iovec __user *,vec,
969 compat_ulong_t, vlen) 995 compat_ulong_t, vlen)
970{ 996{
971 struct fd f = fdget(fd); 997 struct fd f = fdget_pos(fd);
972 ssize_t ret; 998 ssize_t ret;
973 loff_t pos; 999 loff_t pos;
974 1000
@@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
978 ret = compat_readv(f.file, vec, vlen, &pos); 1004 ret = compat_readv(f.file, vec, vlen, &pos);
979 if (ret >= 0) 1005 if (ret >= 0)
980 f.file->f_pos = pos; 1006 f.file->f_pos = pos;
981 fdput(f); 1007 fdput_pos(f);
982 return ret; 1008 return ret;
983} 1009}
984 1010
@@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1035 const struct compat_iovec __user *, vec, 1061 const struct compat_iovec __user *, vec,
1036 compat_ulong_t, vlen) 1062 compat_ulong_t, vlen)
1037{ 1063{
1038 struct fd f = fdget(fd); 1064 struct fd f = fdget_pos(fd);
1039 ssize_t ret; 1065 ssize_t ret;
1040 loff_t pos; 1066 loff_t pos;
1041 1067
@@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1045 ret = compat_writev(f.file, vec, vlen, &pos); 1071 ret = compat_writev(f.file, vec, vlen, &pos);
1046 if (ret >= 0) 1072 if (ret >= 0)
1047 f.file->f_pos = pos; 1073 f.file->f_pos = pos;
1048 fdput(f); 1074 fdput_pos(f);
1049 return ret; 1075 return ret;
1050} 1076}
1051 1077
diff --git a/include/linux/file.h b/include/linux/file.h
index cbacf4faf447..f2517fa2d610 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed)
28 28
29struct fd { 29struct fd {
30 struct file *file; 30 struct file *file;
31 int need_put; 31 unsigned int flags;
32}; 32};
33#define FDPUT_FPUT 1
34#define FDPUT_POS_UNLOCK 2
33 35
34static inline void fdput(struct fd fd) 36static inline void fdput(struct fd fd)
35{ 37{
36 if (fd.need_put) 38 if (fd.flags & FDPUT_FPUT)
37 fput(fd.file); 39 fput(fd.file);
38} 40}
39 41
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60829565e552..ebfde04bca06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
123/* File is opened with O_PATH; almost nothing can be done with it */ 123/* File is opened with O_PATH; almost nothing can be done with it */
124#define FMODE_PATH ((__force fmode_t)0x4000) 124#define FMODE_PATH ((__force fmode_t)0x4000)
125 125
126/* File needs atomic accesses to f_pos */
127#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
128
126/* File was opened by fanotify and shouldn't generate fanotify events */ 129/* File was opened by fanotify and shouldn't generate fanotify events */
127#define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 130#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
128 131
@@ -780,13 +783,14 @@ struct file {
780 const struct file_operations *f_op; 783 const struct file_operations *f_op;
781 784
782 /* 785 /*
783 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. 786 * Protects f_ep_links, f_flags.
784 * Must not be taken from IRQ context. 787 * Must not be taken from IRQ context.
785 */ 788 */
786 spinlock_t f_lock; 789 spinlock_t f_lock;
787 atomic_long_t f_count; 790 atomic_long_t f_count;
788 unsigned int f_flags; 791 unsigned int f_flags;
789 fmode_t f_mode; 792 fmode_t f_mode;
793 struct mutex f_pos_lock;
790 loff_t f_pos; 794 loff_t f_pos;
791 struct fown_struct f_owner; 795 struct fown_struct f_owner;
792 const struct cred *f_cred; 796 const struct cred *f_cred;