aboutsummaryrefslogtreecommitdiffstats
path: root/fs/read_write.c
diff options
context:
space:
mode:
author: Andi Kleen <ak@linux.intel.com> 2011-09-15 19:06:48 -0400
committer: root <root@serles.lst.de> 2011-10-28 08:58:58 -0400
commit: ef3d0fd27e90f67e35da516dafc1482c82939a60 (patch)
tree: dea852eab2a52782867becffb11bce2577ed2b91 /fs/read_write.c
parent: 847cc6371ba820763773e993000410d6d8d23515 (diff)
vfs: do (nearly) lockless generic_file_llseek
The i_mutex lock use of generic_file_llseek hurts. Independent processes accessing the same file synchronize over a single lock, even though they have no need for synchronization at all. Under high utilization this can cause llseek to scale very poorly on larger systems.

This patch does some rethinking of the llseek locking model:

First, the 64bit f_pos is not necessarily atomic without locks on 32bit systems. This can already cause races with read() today. This was discussed on linux-kernel in the past and deemed acceptable. The patch does not change that.

Let's look at the different seek variants:

SEEK_SET: Doesn't really need any locking. If there's a race one writer wins, the other loses. For 32bit the non-atomic update races against read() stay the same. Without a lock they can also happen against write() now. The read() race was deemed acceptable in past discussions, and I think if it's ok for read it's ok for write too.
=> Don't need a lock.

SEEK_END: This behaves like SEEK_SET plus it reads the maximum size too. Reading the maximum size would have the 32bit atomic problem. But luckily we already have a way to read the maximum size without locking (i_size_read), so we can just use that instead. Without i_mutex there is no synchronization with write() anymore, however since the write() update is atomic on 64bit it just behaves like another racy SEEK_SET. On non-atomic 32bit it's the same as SEEK_SET.
=> Don't need a lock, but need to use i_size_read()

SEEK_CUR: This has a read-modify-write race window on the same file. One could argue that any application doing unsynchronized seeks on the same file is already broken. But for the sake of not adding a regression here I'm using the file->f_lock to synchronize this. Using this lock is much better than the inode mutex because it doesn't synchronize between processes.
=> So still need a lock, but can use f_lock.

This patch implements this new scheme in generic_file_llseek.
I dropped generic_file_llseek_unlocked and changed all callers.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/read_write.c')
-rw-r--r-- fs/read_write.c 85
1 files changed, 41 insertions, 44 deletions
diff --git a/fs/read_write.c b/fs/read_write.c
index 179f1c33ea57..672b187def62 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file)
35 return file->f_mode & FMODE_UNSIGNED_OFFSET; 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
36} 36}
37 37
38static loff_t lseek_execute(struct file *file, struct inode *inode,
39 loff_t offset, loff_t maxsize)
40{
41 if (offset < 0 && !unsigned_offsets(file))
42 return -EINVAL;
43 if (offset > maxsize)
44 return -EINVAL;
45
46 if (offset != file->f_pos) {
47 file->f_pos = offset;
48 file->f_version = 0;
49 }
50 return offset;
51}
52
38/** 53/**
39 * generic_file_llseek_unlocked - lockless generic llseek implementation 54 * generic_file_llseek - generic llseek implementation for regular files
40 * @file: file structure to seek on 55 * @file: file structure to seek on
41 * @offset: file offset to seek to 56 * @offset: file offset to seek to
42 * @origin: type of seek 57 * @origin: type of seek
43 * 58 *
44 * Updates the file offset to the value specified by @offset and @origin. 59 * This is a generic implementation of ->llseek usable for all normal local
45 * Locking must be provided by the caller. 60 * filesystems. It just updates the file offset to the value specified by
61 * @offset and @origin.
62 *
63 * Synchronization:
64 * SEEK_SET is unsynchronized (but atomic on 64bit platforms)
65 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
66 * read/writes behave like SEEK_SET against seeks.
67 * SEEK_END behaves like SEEK_SET (file size is read via i_size_read()).
46 */ 68 */
47loff_t 69loff_t
48generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) 70generic_file_llseek(struct file *file, loff_t offset, int origin)
49{ 71{
50 struct inode *inode = file->f_mapping->host; 72 struct inode *inode = file->f_mapping->host;
51 73
52 switch (origin) { 74 switch (origin) {
53 case SEEK_END: 75 case SEEK_END:
54 offset += inode->i_size; 76 offset += i_size_read(inode);
55 break; 77 break;
56 case SEEK_CUR: 78 case SEEK_CUR:
57 /* 79 /*
@@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
62 */ 84 */
63 if (offset == 0) 85 if (offset == 0)
64 return file->f_pos; 86 return file->f_pos;
65 offset += file->f_pos; 87 /*
66 break; 88 * f_lock protects against read/modify/write race with other
89 * SEEK_CURs. Note that parallel writes and reads behave
90 * like SEEK_SET.
91 */
92 spin_lock(&file->f_lock);
93 offset = lseek_execute(file, inode, file->f_pos + offset,
94 inode->i_sb->s_maxbytes);
95 spin_unlock(&file->f_lock);
96 return offset;
67 case SEEK_DATA: 97 case SEEK_DATA:
68 /* 98 /*
69 * In the generic case the entire file is data, so as long as 99 * In the generic case the entire file is data, so as long as
70 * offset isn't at the end of the file then the offset is data. 100 * offset isn't at the end of the file then the offset is data.
71 */ 101 */
72 if (offset >= inode->i_size) 102 if (offset >= i_size_read(inode))
73 return -ENXIO; 103 return -ENXIO;
74 break; 104 break;
75 case SEEK_HOLE: 105 case SEEK_HOLE:
@@ -77,46 +107,13 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
77 * There is a virtual hole at the end of the file, so as long as 107 * There is a virtual hole at the end of the file, so as long as
78 * offset isn't i_size or larger, return i_size. 108 * offset isn't i_size or larger, return i_size.
79 */ 109 */
80 if (offset >= inode->i_size) 110 if (offset >= i_size_read(inode))
81 return -ENXIO; 111 return -ENXIO;
82 offset = inode->i_size; 112 offset = i_size_read(inode);
83 break; 113 break;
84 } 114 }
85 115
86 if (offset < 0 && !unsigned_offsets(file)) 116 return lseek_execute(file, inode, offset, inode->i_sb->s_maxbytes);
87 return -EINVAL;
88 if (offset > inode->i_sb->s_maxbytes)
89 return -EINVAL;
90
91 /* Special lock needed here? */
92 if (offset != file->f_pos) {
93 file->f_pos = offset;
94 file->f_version = 0;
95 }
96
97 return offset;
98}
99EXPORT_SYMBOL(generic_file_llseek_unlocked);
100
101/**
102 * generic_file_llseek - generic llseek implementation for regular files
103 * @file: file structure to seek on
104 * @offset: file offset to seek to
105 * @origin: type of seek
106 *
107 * This is a generic implemenation of ->llseek useable for all normal local
108 * filesystems. It just updates the file offset to the value specified by
109 * @offset and @origin under i_mutex.
110 */
111loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
112{
113 loff_t rval;
114
115 mutex_lock(&file->f_dentry->d_inode->i_mutex);
116 rval = generic_file_llseek_unlocked(file, offset, origin);
117 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
118
119 return rval;
120} 117}
121EXPORT_SYMBOL(generic_file_llseek); 118EXPORT_SYMBOL(generic_file_llseek);
122 119