From badf16621c1f9d1ac753be056fce11b43d6e0be5 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Fri, 9 Sep 2005 13:04:10 -0700 Subject: [PATCH] files: break up files struct In order for the RCU to work, the file table array, sets and their sizes must be updated atomically. Instead of ensuring this through too many memory barriers, we put the arrays and their sizes in a separate structure. This patch takes the first step of putting the file table elements in a separate structure fdtable that is embedded within files_struct. It also changes all the users to refer to the file table using the files_fdtable() macro. Subsequent application of RCU becomes easier after this. Signed-off-by: Dipankar Sarma Signed-Off-By: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'fs/file.c') diff --git a/fs/file.c b/fs/file.c index 92b5f25985d2..f5926ce73f37 100644 --- a/fs/file.c +++ b/fs/file.c @@ -59,13 +59,15 @@ static int expand_fd_array(struct files_struct *files, int nr) { struct file **new_fds; int error, nfds; + struct fdtable *fdt; error = -EMFILE; - if (files->max_fds >= NR_OPEN || nr >= NR_OPEN) + fdt = files_fdtable(files); + if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) goto out; - nfds = files->max_fds; + nfds = fdt->max_fds; spin_unlock(&files->file_lock); /* @@ -95,13 +97,14 @@ static int expand_fd_array(struct files_struct *files, int nr) goto out; /* Copy the existing array and install the new pointer */ + fdt = files_fdtable(files); - if (nfds > files->max_fds) { + if (nfds > fdt->max_fds) { struct file **old_fds; int i; - old_fds = xchg(&files->fd, new_fds); - i = xchg(&files->max_fds, nfds); + old_fds = xchg(&fdt->fd, new_fds); + i = xchg(&fdt->max_fds, nfds); /* Don't copy/clear the array if we are creating a new fd array for fork() */ @@ -164,12 +167,14 @@ static int expand_fdset(struct files_struct *files, int nr) { 
fd_set *new_openset = NULL, *new_execset = NULL; int error, nfds = 0; + struct fdtable *fdt; error = -EMFILE; - if (files->max_fdset >= NR_OPEN || nr >= NR_OPEN) + fdt = files_fdtable(files); + if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN) goto out; - nfds = files->max_fdset; + nfds = fdt->max_fdset; spin_unlock(&files->file_lock); /* Expand to the max in easy steps */ @@ -193,24 +198,25 @@ static int expand_fdset(struct files_struct *files, int nr) error = 0; /* Copy the existing tables and install the new pointers */ - if (nfds > files->max_fdset) { - int i = files->max_fdset / (sizeof(unsigned long) * 8); - int count = (nfds - files->max_fdset) / 8; + fdt = files_fdtable(files); + if (nfds > fdt->max_fdset) { + int i = fdt->max_fdset / (sizeof(unsigned long) * 8); + int count = (nfds - fdt->max_fdset) / 8; /* * Don't copy the entire array if the current fdset is * not yet initialised. */ if (i) { - memcpy (new_openset, files->open_fds, files->max_fdset/8); - memcpy (new_execset, files->close_on_exec, files->max_fdset/8); + memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8); + memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8); memset (&new_openset->fds_bits[i], 0, count); memset (&new_execset->fds_bits[i], 0, count); } - nfds = xchg(&files->max_fdset, nfds); - new_openset = xchg(&files->open_fds, new_openset); - new_execset = xchg(&files->close_on_exec, new_execset); + nfds = xchg(&fdt->max_fdset, nfds); + new_openset = xchg(&fdt->open_fds, new_openset); + new_execset = xchg(&fdt->close_on_exec, new_execset); spin_unlock(&files->file_lock); free_fdset (new_openset, nfds); free_fdset (new_execset, nfds); @@ -237,13 +243,15 @@ out: int expand_files(struct files_struct *files, int nr) { int err, expand = 0; + struct fdtable *fdt; - if (nr >= files->max_fdset) { + fdt = files_fdtable(files); + if (nr >= fdt->max_fdset) { expand = 1; if ((err = expand_fdset(files, nr))) goto out; } - if (nr >= files->max_fds) { + if (nr >= fdt->max_fds) { expand = 1; if 
((err = expand_fd_array(files, nr))) goto out; -- cgit v1.2.2 From ab2af1f5005069321c5d130f09cce577b03f43ef Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Fri, 9 Sep 2005 13:04:13 -0700 Subject: [PATCH] files: files struct with RCU Patch to eliminate struct files_struct.file_lock spinlock on the reader side and use rcu refcounting rcuref_xxx api for the f_count refcounter. The updates to the fdtable are done by allocating a new fdtable structure and setting files->fdt to point to the new structure. The fdtable structure is protected by RCU thereby allowing lock-free lookup. For fd arrays/sets that are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A global list of fdtable to be freed is not scalable, so we use a per-cpu list. If keventd is already handling the current cpu's work, we use a timer to defer queueing of that work. Since the last publication, this patch has been re-written to avoid using explicit memory barriers and use rcu_assign_pointer(), rcu_dereference() primitives instead. This required that the fd information is kept in a separate structure (fdtable) and updated atomically. Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 389 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 258 insertions(+), 131 deletions(-) (limited to 'fs/file.c') diff --git a/fs/file.c b/fs/file.c index f5926ce73f37..2127a7b9dc3a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -13,6 +13,25 @@ #include #include #include +#include +#include +#include +#include + +struct fdtable_defer { + spinlock_t lock; + struct work_struct wq; + struct timer_list timer; + struct fdtable *next; +}; + +/* + * We use this list to defer free fdtables that have vmalloced + * sets/arrays. By keeping a per-cpu list, we avoid having to embed + * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in + * this per-task structure. 
+ */ +static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); /* @@ -48,85 +67,143 @@ void free_fd_array(struct file **array, int num) vfree(array); } -/* - * Expand the fd array in the files_struct. Called with the files - * spinlock held for write. - */ - -static int expand_fd_array(struct files_struct *files, int nr) - __releases(files->file_lock) - __acquires(files->file_lock) +static void __free_fdtable(struct fdtable *fdt) { - struct file **new_fds; - int error, nfds; - struct fdtable *fdt; + int fdset_size, fdarray_size; - - error = -EMFILE; - fdt = files_fdtable(files); - if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) - goto out; + fdset_size = fdt->max_fdset / 8; + fdarray_size = fdt->max_fds * sizeof(struct file *); + free_fdset(fdt->open_fds, fdset_size); + free_fdset(fdt->close_on_exec, fdset_size); + free_fd_array(fdt->fd, fdarray_size); + kfree(fdt); +} - nfds = fdt->max_fds; - spin_unlock(&files->file_lock); +static void fdtable_timer(unsigned long data) +{ + struct fdtable_defer *fddef = (struct fdtable_defer *)data; - /* - * Expand to the max in easy steps, and keep expanding it until - * we have enough for the requested fd array size. + spin_lock(&fddef->lock); + /* + * If someone already emptied the queue return. 
*/ + if (!fddef->next) + goto out; + if (!schedule_work(&fddef->wq)) + mod_timer(&fddef->timer, 5); +out: + spin_unlock(&fddef->lock); +} - do { -#if NR_OPEN_DEFAULT < 256 - if (nfds < 256) - nfds = 256; - else -#endif - if (nfds < (PAGE_SIZE / sizeof(struct file *))) - nfds = PAGE_SIZE / sizeof(struct file *); - else { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; - } - } while (nfds <= nr); +static void free_fdtable_work(struct fdtable_defer *f) +{ + struct fdtable *fdt; - error = -ENOMEM; - new_fds = alloc_fd_array(nfds); - spin_lock(&files->file_lock); - if (!new_fds) - goto out; + spin_lock_bh(&f->lock); + fdt = f->next; + f->next = NULL; + spin_unlock_bh(&f->lock); + while(fdt) { + struct fdtable *next = fdt->next; + __free_fdtable(fdt); + fdt = next; + } +} - /* Copy the existing array and install the new pointer */ - fdt = files_fdtable(files); +static void free_fdtable_rcu(struct rcu_head *rcu) +{ + struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); + int fdset_size, fdarray_size; + struct fdtable_defer *fddef; - if (nfds > fdt->max_fds) { - struct file **old_fds; - int i; - - old_fds = xchg(&fdt->fd, new_fds); - i = xchg(&fdt->max_fds, nfds); - - /* Don't copy/clear the array if we are creating a new - fd array for fork() */ - if (i) { - memcpy(new_fds, old_fds, i * sizeof(struct file *)); - /* clear the remainder of the array */ - memset(&new_fds[i], 0, - (nfds-i) * sizeof(struct file *)); - - spin_unlock(&files->file_lock); - free_fd_array(old_fds, i); - spin_lock(&files->file_lock); - } + BUG_ON(!fdt); + fdset_size = fdt->max_fdset / 8; + fdarray_size = fdt->max_fds * sizeof(struct file *); + + if (fdt->free_files) { + /* + * The this fdtable was embedded in the files structure + * and the files structure itself was getting destroyed. + * It is now safe to free the files structure. 
+ */ + kmem_cache_free(files_cachep, fdt->free_files); + return; + } + if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) { + /* + * The fdtable was embedded + */ + return; + } + if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { + kfree(fdt->open_fds); + kfree(fdt->close_on_exec); + kfree(fdt->fd); + kfree(fdt); } else { - /* Somebody expanded the array while we slept ... */ - spin_unlock(&files->file_lock); - free_fd_array(new_fds, nfds); - spin_lock(&files->file_lock); + fddef = &get_cpu_var(fdtable_defer_list); + spin_lock(&fddef->lock); + fdt->next = fddef->next; + fddef->next = fdt; + /* + * vmallocs are handled from the workqueue context. + * If the per-cpu workqueue is running, then we + * defer work scheduling through a timer. + */ + if (!schedule_work(&fddef->wq)) + mod_timer(&fddef->timer, 5); + spin_unlock(&fddef->lock); + put_cpu_var(fdtable_defer_list); } - error = 0; -out: - return error; +} + +void free_fdtable(struct fdtable *fdt) +{ + if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE || + fdt->max_fds > NR_OPEN_DEFAULT) + call_rcu(&fdt->rcu, free_fdtable_rcu); +} + +/* + * Expand the fdset in the files_struct. Called with the files spinlock + * held for write. + */ +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt) +{ + int i; + int count; + + BUG_ON(nfdt->max_fdset < fdt->max_fdset); + BUG_ON(nfdt->max_fds < fdt->max_fds); + /* Copy the existing tables and install the new pointers */ + + i = fdt->max_fdset / (sizeof(unsigned long) * 8); + count = (nfdt->max_fdset - fdt->max_fdset) / 8; + + /* + * Don't copy the entire array if the current fdset is + * not yet initialised. 
+ */ + if (i) { + memcpy (nfdt->open_fds, fdt->open_fds, + fdt->max_fdset/8); + memcpy (nfdt->close_on_exec, fdt->close_on_exec, + fdt->max_fdset/8); + memset (&nfdt->open_fds->fds_bits[i], 0, count); + memset (&nfdt->close_on_exec->fds_bits[i], 0, count); + } + + /* Don't copy/clear the array if we are creating a new + fd array for fork() */ + if (fdt->max_fds) { + memcpy(nfdt->fd, fdt->fd, + fdt->max_fds * sizeof(struct file *)); + /* clear the remainder of the array */ + memset(&nfdt->fd[fdt->max_fds], 0, + (nfdt->max_fds - fdt->max_fds) * + sizeof(struct file *)); + } + nfdt->next_fd = fdt->next_fd; } /* @@ -157,28 +234,21 @@ void free_fdset(fd_set *array, int num) vfree(array); } -/* - * Expand the fdset in the files_struct. Called with the files spinlock - * held for write. - */ -static int expand_fdset(struct files_struct *files, int nr) - __releases(file->file_lock) - __acquires(file->file_lock) +static struct fdtable *alloc_fdtable(int nr) { - fd_set *new_openset = NULL, *new_execset = NULL; - int error, nfds = 0; - struct fdtable *fdt; - - error = -EMFILE; - fdt = files_fdtable(files); - if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN) - goto out; + struct fdtable *fdt = NULL; + int nfds = 0; + fd_set *new_openset = NULL, *new_execset = NULL; + struct file **new_fds; - nfds = fdt->max_fdset; - spin_unlock(&files->file_lock); + fdt = kmalloc(sizeof(*fdt), GFP_KERNEL); + if (!fdt) + goto out; + memset(fdt, 0, sizeof(*fdt)); - /* Expand to the max in easy steps */ - do { + nfds = __FD_SETSIZE; + /* Expand to the max in easy steps */ + do { if (nfds < (PAGE_SIZE * 8)) nfds = PAGE_SIZE * 8; else { @@ -188,50 +258,88 @@ static int expand_fdset(struct files_struct *files, int nr) } } while (nfds <= nr); - error = -ENOMEM; - new_openset = alloc_fdset(nfds); - new_execset = alloc_fdset(nfds); - spin_lock(&files->file_lock); - if (!new_openset || !new_execset) + new_openset = alloc_fdset(nfds); + new_execset = alloc_fdset(nfds); + if (!new_openset || !new_execset) + 
goto out; + fdt->open_fds = new_openset; + fdt->close_on_exec = new_execset; + fdt->max_fdset = nfds; + + nfds = NR_OPEN_DEFAULT; + /* + * Expand to the max in easy steps, and keep expanding it until + * we have enough for the requested fd array size. + */ + do { +#if NR_OPEN_DEFAULT < 256 + if (nfds < 256) + nfds = 256; + else +#endif + if (nfds < (PAGE_SIZE / sizeof(struct file *))) + nfds = PAGE_SIZE / sizeof(struct file *); + else { + nfds = nfds * 2; + if (nfds > NR_OPEN) + nfds = NR_OPEN; + } + } while (nfds <= nr); + new_fds = alloc_fd_array(nfds); + if (!new_fds) goto out; + fdt->fd = new_fds; + fdt->max_fds = nfds; + fdt->free_files = NULL; + return fdt; +out: + if (new_openset) + free_fdset(new_openset, nfds); + if (new_execset) + free_fdset(new_execset, nfds); + kfree(fdt); + return NULL; +} - error = 0; - - /* Copy the existing tables and install the new pointers */ +/* + * Expands the file descriptor table - it will allocate a new fdtable and + * both fd array and fdset. It is expected to be called with the + * files_lock held. + */ +static int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) +{ + int error = 0; + struct fdtable *fdt; + struct fdtable *nfdt = NULL; + + spin_unlock(&files->file_lock); + nfdt = alloc_fdtable(nr); + if (!nfdt) { + error = -ENOMEM; + spin_lock(&files->file_lock); + goto out; + } + + spin_lock(&files->file_lock); fdt = files_fdtable(files); - if (nfds > fdt->max_fdset) { - int i = fdt->max_fdset / (sizeof(unsigned long) * 8); - int count = (nfds - fdt->max_fdset) / 8; - - /* - * Don't copy the entire array if the current fdset is - * not yet initialised. 
- */ - if (i) { - memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8); - memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8); - memset (&new_openset->fds_bits[i], 0, count); - memset (&new_execset->fds_bits[i], 0, count); - } - - nfds = xchg(&fdt->max_fdset, nfds); - new_openset = xchg(&fdt->open_fds, new_openset); - new_execset = xchg(&fdt->close_on_exec, new_execset); + /* + * Check again since another task may have expanded the + * fd table while we dropped the lock + */ + if (nr >= fdt->max_fds || nr >= fdt->max_fdset) { + copy_fdtable(nfdt, fdt); + } else { + /* Somebody expanded while we dropped file_lock */ spin_unlock(&files->file_lock); - free_fdset (new_openset, nfds); - free_fdset (new_execset, nfds); + __free_fdtable(nfdt); spin_lock(&files->file_lock); - return 0; - } - /* Somebody expanded the array while we slept ... */ - + goto out; + } + rcu_assign_pointer(files->fdt, nfdt); + free_fdtable(fdt); out: - spin_unlock(&files->file_lock); - if (new_openset) - free_fdset(new_openset, nfds); - if (new_execset) - free_fdset(new_execset, nfds); - spin_lock(&files->file_lock); return error; } @@ -246,17 +354,36 @@ int expand_files(struct files_struct *files, int nr) struct fdtable *fdt; fdt = files_fdtable(files); - if (nr >= fdt->max_fdset) { - expand = 1; - if ((err = expand_fdset(files, nr))) + if (nr >= fdt->max_fdset || nr >= fdt->max_fds) { + if (fdt->max_fdset >= NR_OPEN || + fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) { + err = -EMFILE; goto out; - } - if (nr >= fdt->max_fds) { + } expand = 1; - if ((err = expand_fd_array(files, nr))) + if ((err = expand_fdtable(files, nr))) goto out; } err = expand; out: return err; } + +static void __devinit fdtable_defer_list_init(int cpu) +{ + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); + spin_lock_init(&fddef->lock); + INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef); + init_timer(&fddef->timer); + fddef->timer.data = (unsigned long)fddef; + fddef->timer.function = 
fdtable_timer; + fddef->next = NULL; +} + +void __init files_defer_init(void) +{ + int i; + /* Really early - can't use for_each_cpu */ + for (i = 0; i < NR_CPUS; i++) + fdtable_defer_list_init(i); +} -- cgit v1.2.2