summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2015-06-30 09:54:08 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2015-07-01 02:30:09 -0400
commit8a81252b774b53e628a8a0fe18e2b8fc236d92cc (patch)
tree2176cd782e2b578da7ac1072f4430bf68e22c274
parent1af95de6f0119d5bde02d3a811a9f3a3661e954e (diff)
fs/file.c: don't acquire files->file_lock in fd_install()
Mateusz Guzik reported : Currently obtaining a new file descriptor results in locking fdtable twice - once in order to reserve a slot and second time to fill it. Holding the spinlock in __fd_install() is needed in case a resize is done, or to prevent a resize. Mateusz provided an RFC patch and a micro benchmark : http://people.redhat.com/~mguzik/pipebench.c A resize is an unlikely operation in a process lifetime, as table size is at least doubled at every resize. We can use RCU instead of the spinlock. __fd_install() must wait if a resize is in progress. The resize must block new __fd_install() callers from starting, and wait that ongoing install are finished (synchronize_sched()) resize should be attempted by a single thread to not waste resources. rcu_sched variant is used, as __fd_install() and expand_fdtable() run from process context. It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R) CPU E5-2696 v2 @ 2.50GHz Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Mateusz Guzik <mguzik@redhat.com> Acked-by: Mateusz Guzik <mguzik@redhat.com> Tested-by: Mateusz Guzik <mguzik@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--Documentation/filesystems/porting4
-rw-r--r--fs/file.c67
-rw-r--r--include/linux/fdtable.h3
3 files changed, 55 insertions, 19 deletions
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 3eae250254d5..ec5456113072 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -500,3 +500,7 @@ in your dentry operations instead.
500 dentry, it does not get nameidata at all and it gets called only when cookie 500 dentry, it does not get nameidata at all and it gets called only when cookie
501 is non-NULL. Note that link body isn't available anymore, so if you need it, 501 is non-NULL. Note that link body isn't available anymore, so if you need it,
502 store it as cookie. 502 store it as cookie.
503--
504[mandatory]
505 __fd_install() & fd_install() can now sleep. Callers should not
506 hold a spinlock or other resources that do not allow a schedule.
diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..3d2eb4c542a4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
147 147
148 spin_unlock(&files->file_lock); 148 spin_unlock(&files->file_lock);
149 new_fdt = alloc_fdtable(nr); 149 new_fdt = alloc_fdtable(nr);
150
151 /* make sure all __fd_install() have seen resize_in_progress
152 * or have finished their rcu_read_lock_sched() section.
153 */
154 if (atomic_read(&files->count) > 1)
155 synchronize_sched();
156
150 spin_lock(&files->file_lock); 157 spin_lock(&files->file_lock);
151 if (!new_fdt) 158 if (!new_fdt)
152 return -ENOMEM; 159 return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
158 __free_fdtable(new_fdt); 165 __free_fdtable(new_fdt);
159 return -EMFILE; 166 return -EMFILE;
160 } 167 }
161 /*
162 * Check again since another task may have expanded the fd table while
163 * we dropped the lock
164 */
165 cur_fdt = files_fdtable(files); 168 cur_fdt = files_fdtable(files);
166 if (nr >= cur_fdt->max_fds) { 169 BUG_ON(nr < cur_fdt->max_fds);
167 /* Continue as planned */ 170 copy_fdtable(new_fdt, cur_fdt);
168 copy_fdtable(new_fdt, cur_fdt); 171 rcu_assign_pointer(files->fdt, new_fdt);
169 rcu_assign_pointer(files->fdt, new_fdt); 172 if (cur_fdt != &files->fdtab)
170 if (cur_fdt != &files->fdtab) 173 call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
171 call_rcu(&cur_fdt->rcu, free_fdtable_rcu); 174 /* coupled with smp_rmb() in __fd_install() */
172 } else { 175 smp_wmb();
173 /* Somebody else expanded, so undo our attempt */
174 __free_fdtable(new_fdt);
175 }
176 return 1; 176 return 1;
177} 177}
178 178
@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
185 * The files->file_lock should be held on entry, and will be held on exit. 185 * The files->file_lock should be held on entry, and will be held on exit.
186 */ 186 */
187static int expand_files(struct files_struct *files, int nr) 187static int expand_files(struct files_struct *files, int nr)
188 __releases(files->file_lock)
189 __acquires(files->file_lock)
188{ 190{
189 struct fdtable *fdt; 191 struct fdtable *fdt;
192 int expanded = 0;
190 193
194repeat:
191 fdt = files_fdtable(files); 195 fdt = files_fdtable(files);
192 196
193 /* Do we need to expand? */ 197 /* Do we need to expand? */
194 if (nr < fdt->max_fds) 198 if (nr < fdt->max_fds)
195 return 0; 199 return expanded;
196 200
197 /* Can we expand? */ 201 /* Can we expand? */
198 if (nr >= sysctl_nr_open) 202 if (nr >= sysctl_nr_open)
199 return -EMFILE; 203 return -EMFILE;
200 204
205 if (unlikely(files->resize_in_progress)) {
206 spin_unlock(&files->file_lock);
207 expanded = 1;
208 wait_event(files->resize_wait, !files->resize_in_progress);
209 spin_lock(&files->file_lock);
210 goto repeat;
211 }
212
201 /* All good, so we try */ 213 /* All good, so we try */
202 return expand_fdtable(files, nr); 214 files->resize_in_progress = true;
215 expanded = expand_fdtable(files, nr);
216 files->resize_in_progress = false;
217
218 wake_up_all(&files->resize_wait);
219 return expanded;
203} 220}
204 221
205static inline void __set_close_on_exec(int fd, struct fdtable *fdt) 222static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
256 atomic_set(&newf->count, 1); 273 atomic_set(&newf->count, 1);
257 274
258 spin_lock_init(&newf->file_lock); 275 spin_lock_init(&newf->file_lock);
276 newf->resize_in_progress = false;
277 init_waitqueue_head(&newf->resize_wait);
259 newf->next_fd = 0; 278 newf->next_fd = 0;
260 new_fdt = &newf->fdtab; 279 new_fdt = &newf->fdtab;
261 new_fdt->max_fds = NR_OPEN_DEFAULT; 280 new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
553 struct file *file) 572 struct file *file)
554{ 573{
555 struct fdtable *fdt; 574 struct fdtable *fdt;
556 spin_lock(&files->file_lock); 575
557 fdt = files_fdtable(files); 576 might_sleep();
577 rcu_read_lock_sched();
578
579 while (unlikely(files->resize_in_progress)) {
580 rcu_read_unlock_sched();
581 wait_event(files->resize_wait, !files->resize_in_progress);
582 rcu_read_lock_sched();
583 }
584 /* coupled with smp_wmb() in expand_fdtable() */
585 smp_rmb();
586 fdt = rcu_dereference_sched(files->fdt);
558 BUG_ON(fdt->fd[fd] != NULL); 587 BUG_ON(fdt->fd[fd] != NULL);
559 rcu_assign_pointer(fdt->fd[fd], file); 588 rcu_assign_pointer(fdt->fd[fd], file);
560 spin_unlock(&files->file_lock); 589 rcu_read_unlock_sched();
561} 590}
562 591
563void fd_install(unsigned int fd, struct file *file) 592void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
47 * read mostly part 47 * read mostly part
48 */ 48 */
49 atomic_t count; 49 atomic_t count;
50 bool resize_in_progress;
51 wait_queue_head_t resize_wait;
52
50 struct fdtable __rcu *fdt; 53 struct fdtable __rcu *fdt;
51 struct fdtable fdtab; 54 struct fdtable fdtab;
52 /* 55 /*