diff options
| author | Eric Dumazet <edumazet@google.com> | 2015-06-30 09:54:08 -0400 |
|---|---|---|
| committer | Al Viro <viro@zeniv.linux.org.uk> | 2015-07-01 02:30:09 -0400 |
| commit | 8a81252b774b53e628a8a0fe18e2b8fc236d92cc (patch) | |
| tree | 2176cd782e2b578da7ac1072f4430bf68e22c274 | |
| parent | 1af95de6f0119d5bde02d3a811a9f3a3661e954e (diff) | |
fs/file.c: don't acquire files->file_lock in fd_install()
Mateusz Guzik reported :
Currently obtaining a new file descriptor results in locking fdtable
twice - once in order to reserve a slot and second time to fill it.
Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.
Mateusz provided an RFC patch and a micro benchmark :
http://people.redhat.com/~mguzik/pipebench.c
A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.
We can use RCU instead of the spinlock.
__fd_install() must wait if a resize is in progress.
The resize must block new __fd_install() callers from starting,
and wait that ongoing install are finished (synchronize_sched())
resize should be attempted by a single thread to not waste resources.
rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.
It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R)
CPU E5-2696 v2 @ 2.50GHz
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Mateusz Guzik <mguzik@redhat.com>
Acked-by: Mateusz Guzik <mguzik@redhat.com>
Tested-by: Mateusz Guzik <mguzik@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
| -rw-r--r-- | Documentation/filesystems/porting | 4 | ||||
| -rw-r--r-- | fs/file.c | 67 | ||||
| -rw-r--r-- | include/linux/fdtable.h | 3 |
3 files changed, 55 insertions, 19 deletions
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 3eae250254d5..ec5456113072 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
| @@ -500,3 +500,7 @@ in your dentry operations instead. | |||
| 500 | dentry, it does not get nameidata at all and it gets called only when cookie | 500 | dentry, it does not get nameidata at all and it gets called only when cookie |
| 501 | is non-NULL. Note that link body isn't available anymore, so if you need it, | 501 | is non-NULL. Note that link body isn't available anymore, so if you need it, |
| 502 | store it as cookie. | 502 | store it as cookie. |
| 503 | -- | ||
| 504 | [mandatory] | ||
| 505 | __fd_install() & fd_install() can now sleep. Callers should not | ||
| 506 | hold a spinlock or other resources that do not allow a schedule. | ||
| @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) | |||
| 147 | 147 | ||
| 148 | spin_unlock(&files->file_lock); | 148 | spin_unlock(&files->file_lock); |
| 149 | new_fdt = alloc_fdtable(nr); | 149 | new_fdt = alloc_fdtable(nr); |
| 150 | |||
| 151 | /* make sure all __fd_install() have seen resize_in_progress | ||
| 152 | * or have finished their rcu_read_lock_sched() section. | ||
| 153 | */ | ||
| 154 | if (atomic_read(&files->count) > 1) | ||
| 155 | synchronize_sched(); | ||
| 156 | |||
| 150 | spin_lock(&files->file_lock); | 157 | spin_lock(&files->file_lock); |
| 151 | if (!new_fdt) | 158 | if (!new_fdt) |
| 152 | return -ENOMEM; | 159 | return -ENOMEM; |
| @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) | |||
| 158 | __free_fdtable(new_fdt); | 165 | __free_fdtable(new_fdt); |
| 159 | return -EMFILE; | 166 | return -EMFILE; |
| 160 | } | 167 | } |
| 161 | /* | ||
| 162 | * Check again since another task may have expanded the fd table while | ||
| 163 | * we dropped the lock | ||
| 164 | */ | ||
| 165 | cur_fdt = files_fdtable(files); | 168 | cur_fdt = files_fdtable(files); |
| 166 | if (nr >= cur_fdt->max_fds) { | 169 | BUG_ON(nr < cur_fdt->max_fds); |
| 167 | /* Continue as planned */ | 170 | copy_fdtable(new_fdt, cur_fdt); |
| 168 | copy_fdtable(new_fdt, cur_fdt); | 171 | rcu_assign_pointer(files->fdt, new_fdt); |
| 169 | rcu_assign_pointer(files->fdt, new_fdt); | 172 | if (cur_fdt != &files->fdtab) |
| 170 | if (cur_fdt != &files->fdtab) | 173 | call_rcu(&cur_fdt->rcu, free_fdtable_rcu); |
| 171 | call_rcu(&cur_fdt->rcu, free_fdtable_rcu); | 174 | /* coupled with smp_rmb() in __fd_install() */ |
| 172 | } else { | 175 | smp_wmb(); |
| 173 | /* Somebody else expanded, so undo our attempt */ | ||
| 174 | __free_fdtable(new_fdt); | ||
| 175 | } | ||
| 176 | return 1; | 176 | return 1; |
| 177 | } | 177 | } |
| 178 | 178 | ||
| @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) | |||
| 185 | * The files->file_lock should be held on entry, and will be held on exit. | 185 | * The files->file_lock should be held on entry, and will be held on exit. |
| 186 | */ | 186 | */ |
| 187 | static int expand_files(struct files_struct *files, int nr) | 187 | static int expand_files(struct files_struct *files, int nr) |
| 188 | __releases(files->file_lock) | ||
| 189 | __acquires(files->file_lock) | ||
| 188 | { | 190 | { |
| 189 | struct fdtable *fdt; | 191 | struct fdtable *fdt; |
| 192 | int expanded = 0; | ||
| 190 | 193 | ||
| 194 | repeat: | ||
| 191 | fdt = files_fdtable(files); | 195 | fdt = files_fdtable(files); |
| 192 | 196 | ||
| 193 | /* Do we need to expand? */ | 197 | /* Do we need to expand? */ |
| 194 | if (nr < fdt->max_fds) | 198 | if (nr < fdt->max_fds) |
| 195 | return 0; | 199 | return expanded; |
| 196 | 200 | ||
| 197 | /* Can we expand? */ | 201 | /* Can we expand? */ |
| 198 | if (nr >= sysctl_nr_open) | 202 | if (nr >= sysctl_nr_open) |
| 199 | return -EMFILE; | 203 | return -EMFILE; |
| 200 | 204 | ||
| 205 | if (unlikely(files->resize_in_progress)) { | ||
| 206 | spin_unlock(&files->file_lock); | ||
| 207 | expanded = 1; | ||
| 208 | wait_event(files->resize_wait, !files->resize_in_progress); | ||
| 209 | spin_lock(&files->file_lock); | ||
| 210 | goto repeat; | ||
| 211 | } | ||
| 212 | |||
| 201 | /* All good, so we try */ | 213 | /* All good, so we try */ |
| 202 | return expand_fdtable(files, nr); | 214 | files->resize_in_progress = true; |
| 215 | expanded = expand_fdtable(files, nr); | ||
| 216 | files->resize_in_progress = false; | ||
| 217 | |||
| 218 | wake_up_all(&files->resize_wait); | ||
| 219 | return expanded; | ||
| 203 | } | 220 | } |
| 204 | 221 | ||
| 205 | static inline void __set_close_on_exec(int fd, struct fdtable *fdt) | 222 | static inline void __set_close_on_exec(int fd, struct fdtable *fdt) |
| @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 256 | atomic_set(&newf->count, 1); | 273 | atomic_set(&newf->count, 1); |
| 257 | 274 | ||
| 258 | spin_lock_init(&newf->file_lock); | 275 | spin_lock_init(&newf->file_lock); |
| 276 | newf->resize_in_progress = false; | ||
| 277 | init_waitqueue_head(&newf->resize_wait); | ||
| 259 | newf->next_fd = 0; | 278 | newf->next_fd = 0; |
| 260 | new_fdt = &newf->fdtab; | 279 | new_fdt = &newf->fdtab; |
| 261 | new_fdt->max_fds = NR_OPEN_DEFAULT; | 280 | new_fdt->max_fds = NR_OPEN_DEFAULT; |
| @@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, | |||
| 553 | struct file *file) | 572 | struct file *file) |
| 554 | { | 573 | { |
| 555 | struct fdtable *fdt; | 574 | struct fdtable *fdt; |
| 556 | spin_lock(&files->file_lock); | 575 | |
| 557 | fdt = files_fdtable(files); | 576 | might_sleep(); |
| 577 | rcu_read_lock_sched(); | ||
| 578 | |||
| 579 | while (unlikely(files->resize_in_progress)) { | ||
| 580 | rcu_read_unlock_sched(); | ||
| 581 | wait_event(files->resize_wait, !files->resize_in_progress); | ||
| 582 | rcu_read_lock_sched(); | ||
| 583 | } | ||
| 584 | /* coupled with smp_wmb() in expand_fdtable() */ | ||
| 585 | smp_rmb(); | ||
| 586 | fdt = rcu_dereference_sched(files->fdt); | ||
| 558 | BUG_ON(fdt->fd[fd] != NULL); | 587 | BUG_ON(fdt->fd[fd] != NULL); |
| 559 | rcu_assign_pointer(fdt->fd[fd], file); | 588 | rcu_assign_pointer(fdt->fd[fd], file); |
| 560 | spin_unlock(&files->file_lock); | 589 | rcu_read_unlock_sched(); |
| 561 | } | 590 | } |
| 562 | 591 | ||
| 563 | void fd_install(unsigned int fd, struct file *file) | 592 | void fd_install(unsigned int fd, struct file *file) |
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 230f87bdf5ad..fbb88740634a 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h | |||
| @@ -47,6 +47,9 @@ struct files_struct { | |||
| 47 | * read mostly part | 47 | * read mostly part |
| 48 | */ | 48 | */ |
| 49 | atomic_t count; | 49 | atomic_t count; |
| 50 | bool resize_in_progress; | ||
| 51 | wait_queue_head_t resize_wait; | ||
| 52 | |||
| 50 | struct fdtable __rcu *fdt; | 53 | struct fdtable __rcu *fdt; |
| 51 | struct fdtable fdtab; | 54 | struct fdtable fdtab; |
| 52 | /* | 55 | /* |
