authorDipankar Sarma <dipankar@in.ibm.com>2005-09-09 16:04:13 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-09 16:57:55 -0400
commitab2af1f5005069321c5d130f09cce577b03f43ef (patch)
tree73a70ba486f522cd9eeeef376ede2b5a1c1b473b
parent6e72ad2c581de121cc7e772469e2a8f6b1fd4379 (diff)
[PATCH] files: files struct with RCU
Eliminate the struct files_struct.file_lock spinlock on the reader side and use the RCU-based rcuref_xxx API for the f_count refcounter. Updates to the fdtable are done by allocating a new fdtable structure and setting files->fdt to point to it. The fdtable structure is protected by RCU, thereby allowing lock-free lookup. For fd arrays/sets that are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A global list of fdtables to be freed would not be scalable, so we use a per-cpu list; if keventd is already handling the current CPU's work, we use a timer to defer queueing of that work.

Since the last publication, this patch has been rewritten to avoid explicit memory barriers and to use the rcu_assign_pointer() and rcu_dereference() primitives instead. This required that the fd information be kept in a separate structure (fdtable) and updated atomically.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
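In short: readers walk files->fdt and the fd array under rcu_read_lock() without taking file_lock and take a file reference with the lock-free rcuref_inc_lf(); updaters still serialise on file_lock, build a complete new fdtable, publish it with rcu_assign_pointer(), and only free the old one after a grace period. The kernel-style sketch below condenses that pattern; it is illustrative only (lookup_fd_rcu() and publish_new_fdtable() are made-up names), while the real code lives in fget()/fget_light() and expand_fdtable() in the diff that follows.

/*
 * Condensed sketch of the pattern this patch introduces.  Not the
 * actual fs/file.c code; the helpers it calls (files_fdtable(),
 * copy_fdtable(), free_fdtable(), rcuref_inc_lf()) are the ones
 * added or used by this patch.
 */

/* Reader side: lock-free fd lookup, as fget() does after this patch. */
static struct file *lookup_fd_rcu(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt;
	struct file *file = NULL;

	rcu_read_lock();
	fdt = files_fdtable(files);		/* rcu_dereference(files->fdt) */
	if (fd < fdt->max_fds)
		file = rcu_dereference(fdt->fd[fd]);
	/*
	 * The file may be going away concurrently; rcuref_inc_lf() takes
	 * a reference only if f_count has not already dropped to zero.
	 */
	if (file && !rcuref_inc_lf(&file->f_count))
		file = NULL;
	rcu_read_unlock();

	return file;
}

/* Update side: publish a bigger table, defer freeing the old one. */
static void publish_new_fdtable(struct files_struct *files,
				struct fdtable *new_fdt)
{
	struct fdtable *old_fdt;

	spin_lock(&files->file_lock);		/* updaters still serialise */
	old_fdt = files_fdtable(files);
	copy_fdtable(new_fdt, old_fdt);		/* copy fd array and fdsets */
	rcu_assign_pointer(files->fdt, new_fdt); /* readers see the new table */
	spin_unlock(&files->file_lock);

	/*
	 * Readers inside rcu_read_lock() may still hold the old pointer,
	 * so free_fdtable() defers the free to an RCU callback (unless the
	 * old table is the one embedded in files_struct); that callback in
	 * turn hands vmalloc()ed arrays to keventd, since RCU callbacks
	 * must not sleep.
	 */
	free_fdtable(old_fdt);
}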
-rw-r--r--  fs/aio.c                     3
-rw-r--r--  fs/fcntl.c                  13
-rw-r--r--  fs/file.c                  389
-rw-r--r--  fs/file_table.c             40
-rw-r--r--  fs/open.c                    8
-rw-r--r--  include/linux/file.h        11
-rw-r--r--  include/linux/fs.h           4
-rw-r--r--  include/linux/init_task.h    5
-rw-r--r--  kernel/exit.c               15
-rw-r--r--  kernel/fork.c               23
10 files changed, 345 insertions, 166 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 201c1847fa07..38f62680fd63 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,7 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/workqueue.h> 30#include <linux/workqueue.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/rcuref.h>
32 33
33#include <asm/kmap_types.h> 34#include <asm/kmap_types.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -499,7 +500,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
499 /* Must be done under the lock to serialise against cancellation. 500 /* Must be done under the lock to serialise against cancellation.
500 * Call this aio_fput as it duplicates fput via the fput_work. 501 * Call this aio_fput as it duplicates fput via the fput_work.
501 */ 502 */
502 if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { 503 if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
503 get_ioctx(ctx); 504 get_ioctx(ctx);
504 spin_lock(&fput_lock); 505 spin_lock(&fput_lock);
505 list_add(&req->ki_list, &fput_head); 506 list_add(&req->ki_list, &fput_head);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfecc6238083..d2f3ed8acd93 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -16,6 +16,7 @@
16#include <linux/security.h> 16#include <linux/security.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/signal.h> 18#include <linux/signal.h>
19#include <linux/rcupdate.h>
19 20
20#include <asm/poll.h> 21#include <asm/poll.h>
21#include <asm/siginfo.h> 22#include <asm/siginfo.h>
@@ -64,8 +65,8 @@ static int locate_fd(struct files_struct *files,
64 if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 65 if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
65 goto out; 66 goto out;
66 67
67 fdt = files_fdtable(files);
68repeat: 68repeat:
69 fdt = files_fdtable(files);
69 /* 70 /*
70 * Someone might have closed fd's in the range 71 * Someone might have closed fd's in the range
71 * orig_start..fdt->next_fd 72 * orig_start..fdt->next_fd
@@ -95,9 +96,15 @@ repeat:
95 if (error) 96 if (error)
96 goto repeat; 97 goto repeat;
97 98
99 /*
100 * We reacquired files_lock, so we are safe as long as
101 * we reacquire the fdtable pointer and use it while holding
102 * the lock, no one can free it during that time.
103 */
104 fdt = files_fdtable(files);
98 if (start <= fdt->next_fd) 105 if (start <= fdt->next_fd)
99 fdt->next_fd = newfd + 1; 106 fdt->next_fd = newfd + 1;
100 107
101 error = newfd; 108 error = newfd;
102 109
103out: 110out:
@@ -163,7 +170,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
163 if (!tofree && FD_ISSET(newfd, fdt->open_fds)) 170 if (!tofree && FD_ISSET(newfd, fdt->open_fds))
164 goto out_fput; 171 goto out_fput;
165 172
166 fdt->fd[newfd] = file; 173 rcu_assign_pointer(fdt->fd[newfd], file);
167 FD_SET(newfd, fdt->open_fds); 174 FD_SET(newfd, fdt->open_fds);
168 FD_CLR(newfd, fdt->close_on_exec); 175 FD_CLR(newfd, fdt->close_on_exec);
169 spin_unlock(&files->file_lock); 176 spin_unlock(&files->file_lock);
diff --git a/fs/file.c b/fs/file.c
index f5926ce73f37..2127a7b9dc3a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -13,6 +13,25 @@
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/interrupt.h>
17#include <linux/spinlock.h>
18#include <linux/rcupdate.h>
19#include <linux/workqueue.h>
20
21struct fdtable_defer {
22 spinlock_t lock;
23 struct work_struct wq;
24 struct timer_list timer;
25 struct fdtable *next;
26};
27
28/*
29 * We use this list to defer free fdtables that have vmalloced
30 * sets/arrays. By keeping a per-cpu list, we avoid having to embed
31 * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
32 * this per-task structure.
33 */
34static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
16 35
17 36
18/* 37/*
@@ -48,85 +67,143 @@ void free_fd_array(struct file **array, int num)
48 vfree(array); 67 vfree(array);
49} 68}
50 69
51/* 70static void __free_fdtable(struct fdtable *fdt)
52 * Expand the fd array in the files_struct. Called with the files
53 * spinlock held for write.
54 */
55
56static int expand_fd_array(struct files_struct *files, int nr)
57 __releases(files->file_lock)
58 __acquires(files->file_lock)
59{ 71{
60 struct file **new_fds; 72 int fdset_size, fdarray_size;
61 int error, nfds;
62 struct fdtable *fdt;
63 73
64 74 fdset_size = fdt->max_fdset / 8;
65 error = -EMFILE; 75 fdarray_size = fdt->max_fds * sizeof(struct file *);
66 fdt = files_fdtable(files); 76 free_fdset(fdt->open_fds, fdset_size);
67 if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) 77 free_fdset(fdt->close_on_exec, fdset_size);
68 goto out; 78 free_fd_array(fdt->fd, fdarray_size);
79 kfree(fdt);
80}
69 81
70 nfds = fdt->max_fds; 82static void fdtable_timer(unsigned long data)
71 spin_unlock(&files->file_lock); 83{
84 struct fdtable_defer *fddef = (struct fdtable_defer *)data;
72 85
73 /* 86 spin_lock(&fddef->lock);
74 * Expand to the max in easy steps, and keep expanding it until 87 /*
75 * we have enough for the requested fd array size. 88 * If someone already emptied the queue return.
76 */ 89 */
90 if (!fddef->next)
91 goto out;
92 if (!schedule_work(&fddef->wq))
93 mod_timer(&fddef->timer, 5);
94out:
95 spin_unlock(&fddef->lock);
96}
77 97
78 do { 98static void free_fdtable_work(struct fdtable_defer *f)
79#if NR_OPEN_DEFAULT < 256 99{
80 if (nfds < 256) 100 struct fdtable *fdt;
81 nfds = 256;
82 else
83#endif
84 if (nfds < (PAGE_SIZE / sizeof(struct file *)))
85 nfds = PAGE_SIZE / sizeof(struct file *);
86 else {
87 nfds = nfds * 2;
88 if (nfds > NR_OPEN)
89 nfds = NR_OPEN;
90 }
91 } while (nfds <= nr);
92 101
93 error = -ENOMEM; 102 spin_lock_bh(&f->lock);
94 new_fds = alloc_fd_array(nfds); 103 fdt = f->next;
95 spin_lock(&files->file_lock); 104 f->next = NULL;
96 if (!new_fds) 105 spin_unlock_bh(&f->lock);
97 goto out; 106 while(fdt) {
107 struct fdtable *next = fdt->next;
108 __free_fdtable(fdt);
109 fdt = next;
110 }
111}
98 112
99 /* Copy the existing array and install the new pointer */ 113static void free_fdtable_rcu(struct rcu_head *rcu)
100 fdt = files_fdtable(files); 114{
115 struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
116 int fdset_size, fdarray_size;
117 struct fdtable_defer *fddef;
101 118
102 if (nfds > fdt->max_fds) { 119 BUG_ON(!fdt);
103 struct file **old_fds; 120 fdset_size = fdt->max_fdset / 8;
104 int i; 121 fdarray_size = fdt->max_fds * sizeof(struct file *);
105 122
106 old_fds = xchg(&fdt->fd, new_fds); 123 if (fdt->free_files) {
107 i = xchg(&fdt->max_fds, nfds); 124 /*
108 125 * This fdtable was embedded in the files structure
109 /* Don't copy/clear the array if we are creating a new 126 * and the files structure itself was getting destroyed.
110 fd array for fork() */ 127 * It is now safe to free the files structure.
111 if (i) { 128 */
112 memcpy(new_fds, old_fds, i * sizeof(struct file *)); 129 kmem_cache_free(files_cachep, fdt->free_files);
113 /* clear the remainder of the array */ 130 return;
114 memset(&new_fds[i], 0, 131 }
115 (nfds-i) * sizeof(struct file *)); 132 if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
116 133 /*
117 spin_unlock(&files->file_lock); 134 * The fdtable was embedded
118 free_fd_array(old_fds, i); 135 */
119 spin_lock(&files->file_lock); 136 return;
120 } 137 }
138 if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
139 kfree(fdt->open_fds);
140 kfree(fdt->close_on_exec);
141 kfree(fdt->fd);
142 kfree(fdt);
121 } else { 143 } else {
122 /* Somebody expanded the array while we slept ... */ 144 fddef = &get_cpu_var(fdtable_defer_list);
123 spin_unlock(&files->file_lock); 145 spin_lock(&fddef->lock);
124 free_fd_array(new_fds, nfds); 146 fdt->next = fddef->next;
125 spin_lock(&files->file_lock); 147 fddef->next = fdt;
148 /*
149 * vmallocs are handled from the workqueue context.
150 * If the per-cpu workqueue is running, then we
151 * defer work scheduling through a timer.
152 */
153 if (!schedule_work(&fddef->wq))
154 mod_timer(&fddef->timer, 5);
155 spin_unlock(&fddef->lock);
156 put_cpu_var(fdtable_defer_list);
126 } 157 }
127 error = 0; 158}
128out: 159
129 return error; 160void free_fdtable(struct fdtable *fdt)
161{
162 if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
163 fdt->max_fds > NR_OPEN_DEFAULT)
164 call_rcu(&fdt->rcu, free_fdtable_rcu);
165}
166
167/*
168 * Expand the fdset in the files_struct. Called with the files spinlock
169 * held for write.
170 */
171static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
172{
173 int i;
174 int count;
175
176 BUG_ON(nfdt->max_fdset < fdt->max_fdset);
177 BUG_ON(nfdt->max_fds < fdt->max_fds);
178 /* Copy the existing tables and install the new pointers */
179
180 i = fdt->max_fdset / (sizeof(unsigned long) * 8);
181 count = (nfdt->max_fdset - fdt->max_fdset) / 8;
182
183 /*
184 * Don't copy the entire array if the current fdset is
185 * not yet initialised.
186 */
187 if (i) {
188 memcpy (nfdt->open_fds, fdt->open_fds,
189 fdt->max_fdset/8);
190 memcpy (nfdt->close_on_exec, fdt->close_on_exec,
191 fdt->max_fdset/8);
192 memset (&nfdt->open_fds->fds_bits[i], 0, count);
193 memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
194 }
195
196 /* Don't copy/clear the array if we are creating a new
197 fd array for fork() */
198 if (fdt->max_fds) {
199 memcpy(nfdt->fd, fdt->fd,
200 fdt->max_fds * sizeof(struct file *));
201 /* clear the remainder of the array */
202 memset(&nfdt->fd[fdt->max_fds], 0,
203 (nfdt->max_fds - fdt->max_fds) *
204 sizeof(struct file *));
205 }
206 nfdt->next_fd = fdt->next_fd;
130} 207}
131 208
132/* 209/*
@@ -157,28 +234,21 @@ void free_fdset(fd_set *array, int num)
157 vfree(array); 234 vfree(array);
158} 235}
159 236
160/* 237static struct fdtable *alloc_fdtable(int nr)
161 * Expand the fdset in the files_struct. Called with the files spinlock
162 * held for write.
163 */
164static int expand_fdset(struct files_struct *files, int nr)
165 __releases(file->file_lock)
166 __acquires(file->file_lock)
167{ 238{
168 fd_set *new_openset = NULL, *new_execset = NULL; 239 struct fdtable *fdt = NULL;
169 int error, nfds = 0; 240 int nfds = 0;
170 struct fdtable *fdt; 241 fd_set *new_openset = NULL, *new_execset = NULL;
171 242 struct file **new_fds;
172 error = -EMFILE;
173 fdt = files_fdtable(files);
174 if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN)
175 goto out;
176 243
177 nfds = fdt->max_fdset; 244 fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
178 spin_unlock(&files->file_lock); 245 if (!fdt)
246 goto out;
247 memset(fdt, 0, sizeof(*fdt));
179 248
180 /* Expand to the max in easy steps */ 249 nfds = __FD_SETSIZE;
181 do { 250 /* Expand to the max in easy steps */
251 do {
182 if (nfds < (PAGE_SIZE * 8)) 252 if (nfds < (PAGE_SIZE * 8))
183 nfds = PAGE_SIZE * 8; 253 nfds = PAGE_SIZE * 8;
184 else { 254 else {
@@ -188,50 +258,88 @@ static int expand_fdset(struct files_struct *files, int nr)
188 } 258 }
189 } while (nfds <= nr); 259 } while (nfds <= nr);
190 260
191 error = -ENOMEM; 261 new_openset = alloc_fdset(nfds);
192 new_openset = alloc_fdset(nfds); 262 new_execset = alloc_fdset(nfds);
193 new_execset = alloc_fdset(nfds); 263 if (!new_openset || !new_execset)
194 spin_lock(&files->file_lock); 264 goto out;
195 if (!new_openset || !new_execset) 265 fdt->open_fds = new_openset;
266 fdt->close_on_exec = new_execset;
267 fdt->max_fdset = nfds;
268
269 nfds = NR_OPEN_DEFAULT;
270 /*
271 * Expand to the max in easy steps, and keep expanding it until
272 * we have enough for the requested fd array size.
273 */
274 do {
275#if NR_OPEN_DEFAULT < 256
276 if (nfds < 256)
277 nfds = 256;
278 else
279#endif
280 if (nfds < (PAGE_SIZE / sizeof(struct file *)))
281 nfds = PAGE_SIZE / sizeof(struct file *);
282 else {
283 nfds = nfds * 2;
284 if (nfds > NR_OPEN)
285 nfds = NR_OPEN;
286 }
287 } while (nfds <= nr);
288 new_fds = alloc_fd_array(nfds);
289 if (!new_fds)
196 goto out; 290 goto out;
291 fdt->fd = new_fds;
292 fdt->max_fds = nfds;
293 fdt->free_files = NULL;
294 return fdt;
295out:
296 if (new_openset)
297 free_fdset(new_openset, nfds);
298 if (new_execset)
299 free_fdset(new_execset, nfds);
300 kfree(fdt);
301 return NULL;
302}
197 303
198 error = 0; 304/*
199 305 * Expands the file descriptor table - it will allocate a new fdtable and
200 /* Copy the existing tables and install the new pointers */ 306 * both fd array and fdset. It is expected to be called with the
307 * files_lock held.
308 */
309static int expand_fdtable(struct files_struct *files, int nr)
310 __releases(files->file_lock)
311 __acquires(files->file_lock)
312{
313 int error = 0;
314 struct fdtable *fdt;
315 struct fdtable *nfdt = NULL;
316
317 spin_unlock(&files->file_lock);
318 nfdt = alloc_fdtable(nr);
319 if (!nfdt) {
320 error = -ENOMEM;
321 spin_lock(&files->file_lock);
322 goto out;
323 }
324
325 spin_lock(&files->file_lock);
201 fdt = files_fdtable(files); 326 fdt = files_fdtable(files);
202 if (nfds > fdt->max_fdset) { 327 /*
203 int i = fdt->max_fdset / (sizeof(unsigned long) * 8); 328 * Check again since another task may have expanded the
204 int count = (nfds - fdt->max_fdset) / 8; 329 * fd table while we dropped the lock
205 330 */
206 /* 331 if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
207 * Don't copy the entire array if the current fdset is 332 copy_fdtable(nfdt, fdt);
208 * not yet initialised. 333 } else {
209 */ 334 /* Somebody expanded while we dropped file_lock */
210 if (i) {
211 memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8);
212 memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8);
213 memset (&new_openset->fds_bits[i], 0, count);
214 memset (&new_execset->fds_bits[i], 0, count);
215 }
216
217 nfds = xchg(&fdt->max_fdset, nfds);
218 new_openset = xchg(&fdt->open_fds, new_openset);
219 new_execset = xchg(&fdt->close_on_exec, new_execset);
220 spin_unlock(&files->file_lock); 335 spin_unlock(&files->file_lock);
221 free_fdset (new_openset, nfds); 336 __free_fdtable(nfdt);
222 free_fdset (new_execset, nfds);
223 spin_lock(&files->file_lock); 337 spin_lock(&files->file_lock);
224 return 0; 338 goto out;
225 } 339 }
226 /* Somebody expanded the array while we slept ... */ 340 rcu_assign_pointer(files->fdt, nfdt);
227 341 free_fdtable(fdt);
228out: 342out:
229 spin_unlock(&files->file_lock);
230 if (new_openset)
231 free_fdset(new_openset, nfds);
232 if (new_execset)
233 free_fdset(new_execset, nfds);
234 spin_lock(&files->file_lock);
235 return error; 343 return error;
236} 344}
237 345
@@ -246,17 +354,36 @@ int expand_files(struct files_struct *files, int nr)
246 struct fdtable *fdt; 354 struct fdtable *fdt;
247 355
248 fdt = files_fdtable(files); 356 fdt = files_fdtable(files);
249 if (nr >= fdt->max_fdset) { 357 if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
250 expand = 1; 358 if (fdt->max_fdset >= NR_OPEN ||
251 if ((err = expand_fdset(files, nr))) 359 fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
360 err = -EMFILE;
252 goto out; 361 goto out;
253 } 362 }
254 if (nr >= fdt->max_fds) {
255 expand = 1; 363 expand = 1;
256 if ((err = expand_fd_array(files, nr))) 364 if ((err = expand_fdtable(files, nr)))
257 goto out; 365 goto out;
258 } 366 }
259 err = expand; 367 err = expand;
260out: 368out:
261 return err; 369 return err;
262} 370}
371
372static void __devinit fdtable_defer_list_init(int cpu)
373{
374 struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
375 spin_lock_init(&fddef->lock);
376 INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
377 init_timer(&fddef->timer);
378 fddef->timer.data = (unsigned long)fddef;
379 fddef->timer.function = fdtable_timer;
380 fddef->next = NULL;
381}
382
383void __init files_defer_init(void)
384{
385 int i;
386 /* Really early - can't use for_each_cpu */
387 for (i = 0; i < NR_CPUS; i++)
388 fdtable_defer_list_init(i);
389}
diff --git a/fs/file_table.c b/fs/file_table.c
index 43e9e1737de2..86ec8ae985b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/eventpoll.h> 16#include <linux/eventpoll.h>
17#include <linux/rcupdate.h>
17#include <linux/mount.h> 18#include <linux/mount.h>
18#include <linux/cdev.h> 19#include <linux/cdev.h>
19#include <linux/fsnotify.h> 20#include <linux/fsnotify.h>
@@ -53,11 +54,17 @@ void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
53 spin_unlock_irqrestore(&filp_count_lock, flags); 54 spin_unlock_irqrestore(&filp_count_lock, flags);
54} 55}
55 56
56static inline void file_free(struct file *f) 57static inline void file_free_rcu(struct rcu_head *head)
57{ 58{
59 struct file *f = container_of(head, struct file, f_rcuhead);
58 kmem_cache_free(filp_cachep, f); 60 kmem_cache_free(filp_cachep, f);
59} 61}
60 62
63static inline void file_free(struct file *f)
64{
65 call_rcu(&f->f_rcuhead, file_free_rcu);
66}
67
61/* Find an unused file structure and return a pointer to it. 68/* Find an unused file structure and return a pointer to it.
62 * Returns NULL, if there are no more free file structures or 69 * Returns NULL, if there are no more free file structures or
63 * we run out of memory. 70 * we run out of memory.
@@ -110,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp);
110 117
111void fastcall fput(struct file *file) 118void fastcall fput(struct file *file)
112{ 119{
113 if (atomic_dec_and_test(&file->f_count)) 120 if (rcuref_dec_and_test(&file->f_count))
114 __fput(file); 121 __fput(file);
115} 122}
116 123
@@ -156,11 +163,17 @@ struct file fastcall *fget(unsigned int fd)
156 struct file *file; 163 struct file *file;
157 struct files_struct *files = current->files; 164 struct files_struct *files = current->files;
158 165
159 spin_lock(&files->file_lock); 166 rcu_read_lock();
160 file = fcheck_files(files, fd); 167 file = fcheck_files(files, fd);
161 if (file) 168 if (file) {
162 get_file(file); 169 if (!rcuref_inc_lf(&file->f_count)) {
163 spin_unlock(&files->file_lock); 170 /* File object ref couldn't be taken */
171 rcu_read_unlock();
172 return NULL;
173 }
174 }
175 rcu_read_unlock();
176
164 return file; 177 return file;
165} 178}
166 179
@@ -182,21 +195,25 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
182 if (likely((atomic_read(&files->count) == 1))) { 195 if (likely((atomic_read(&files->count) == 1))) {
183 file = fcheck_files(files, fd); 196 file = fcheck_files(files, fd);
184 } else { 197 } else {
185 spin_lock(&files->file_lock); 198 rcu_read_lock();
186 file = fcheck_files(files, fd); 199 file = fcheck_files(files, fd);
187 if (file) { 200 if (file) {
188 get_file(file); 201 if (rcuref_inc_lf(&file->f_count))
189 *fput_needed = 1; 202 *fput_needed = 1;
203 else
204 /* Didn't get the reference, someone's freed */
205 file = NULL;
190 } 206 }
191 spin_unlock(&files->file_lock); 207 rcu_read_unlock();
192 } 208 }
209
193 return file; 210 return file;
194} 211}
195 212
196 213
197void put_filp(struct file *file) 214void put_filp(struct file *file)
198{ 215{
199 if (atomic_dec_and_test(&file->f_count)) { 216 if (rcuref_dec_and_test(&file->f_count)) {
200 security_file_free(file); 217 security_file_free(file);
201 file_kill(file); 218 file_kill(file);
202 file_free(file); 219 file_free(file);
@@ -257,4 +274,5 @@ void __init files_init(unsigned long mempages)
257 files_stat.max_files = n; 274 files_stat.max_files = n;
258 if (files_stat.max_files < NR_FILE) 275 if (files_stat.max_files < NR_FILE)
259 files_stat.max_files = NR_FILE; 276 files_stat.max_files = NR_FILE;
277 files_defer_init();
260} 278}
diff --git a/fs/open.c b/fs/open.c
index b6542516a0ca..2fac58c51910 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -24,6 +24,7 @@
24#include <linux/personality.h> 24#include <linux/personality.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <linux/syscalls.h> 26#include <linux/syscalls.h>
27#include <linux/rcupdate.h>
27 28
28#include <asm/unistd.h> 29#include <asm/unistd.h>
29 30
@@ -930,9 +931,8 @@ void fastcall fd_install(unsigned int fd, struct file * file)
930 struct fdtable *fdt; 931 struct fdtable *fdt;
931 spin_lock(&files->file_lock); 932 spin_lock(&files->file_lock);
932 fdt = files_fdtable(files); 933 fdt = files_fdtable(files);
933 if (unlikely(fdt->fd[fd] != NULL)) 934 BUG_ON(fdt->fd[fd] != NULL);
934 BUG(); 935 rcu_assign_pointer(fdt->fd[fd], file);
935 fdt->fd[fd] = file;
936 spin_unlock(&files->file_lock); 936 spin_unlock(&files->file_lock);
937} 937}
938 938
@@ -1024,7 +1024,7 @@ asmlinkage long sys_close(unsigned int fd)
1024 filp = fdt->fd[fd]; 1024 filp = fdt->fd[fd];
1025 if (!filp) 1025 if (!filp)
1026 goto out_unlock; 1026 goto out_unlock;
1027 fdt->fd[fd] = NULL; 1027 rcu_assign_pointer(fdt->fd[fd], NULL);
1028 FD_CLR(fd, fdt->close_on_exec); 1028 FD_CLR(fd, fdt->close_on_exec);
1029 __put_unused_fd(files, fd); 1029 __put_unused_fd(files, fd);
1030 spin_unlock(&files->file_lock); 1030 spin_unlock(&files->file_lock);
diff --git a/include/linux/file.h b/include/linux/file.h
index db372230848e..f5bbd4c508b3 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
9#include <linux/posix_types.h> 9#include <linux/posix_types.h>
10#include <linux/compiler.h> 10#include <linux/compiler.h>
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/rcupdate.h>
12 13
13/* 14/*
14 * The default fd array needs to be at least BITS_PER_LONG, 15 * The default fd array needs to be at least BITS_PER_LONG,
@@ -23,6 +24,9 @@ struct fdtable {
23 struct file ** fd; /* current fd array */ 24 struct file ** fd; /* current fd array */
24 fd_set *close_on_exec; 25 fd_set *close_on_exec;
25 fd_set *open_fds; 26 fd_set *open_fds;
27 struct rcu_head rcu;
28 struct files_struct *free_files;
29 struct fdtable *next;
26}; 30};
27 31
28/* 32/*
@@ -31,13 +35,14 @@ struct fdtable {
31struct files_struct { 35struct files_struct {
32 atomic_t count; 36 atomic_t count;
33 spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ 37 spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */
38 struct fdtable *fdt;
34 struct fdtable fdtab; 39 struct fdtable fdtab;
35 fd_set close_on_exec_init; 40 fd_set close_on_exec_init;
36 fd_set open_fds_init; 41 fd_set open_fds_init;
37 struct file * fd_array[NR_OPEN_DEFAULT]; 42 struct file * fd_array[NR_OPEN_DEFAULT];
38}; 43};
39 44
40#define files_fdtable(files) (&(files)->fdtab) 45#define files_fdtable(files) (rcu_dereference((files)->fdt))
41 46
42extern void FASTCALL(__fput(struct file *)); 47extern void FASTCALL(__fput(struct file *));
43extern void FASTCALL(fput(struct file *)); 48extern void FASTCALL(fput(struct file *));
@@ -65,6 +70,8 @@ extern fd_set *alloc_fdset(int);
65extern void free_fdset(fd_set *, int); 70extern void free_fdset(fd_set *, int);
66 71
67extern int expand_files(struct files_struct *, int nr); 72extern int expand_files(struct files_struct *, int nr);
73extern void free_fdtable(struct fdtable *fdt);
74extern void __init files_defer_init(void);
68 75
69static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) 76static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
70{ 77{
@@ -72,7 +79,7 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in
72 struct fdtable *fdt = files_fdtable(files); 79 struct fdtable *fdt = files_fdtable(files);
73 80
74 if (fd < fdt->max_fds) 81 if (fd < fdt->max_fds)
75 file = fdt->fd[fd]; 82 file = rcu_dereference(fdt->fd[fd]);
76 return file; 83 return file;
77} 84}
78 85
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd93ab7da905..7f61227827d7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -9,6 +9,7 @@
9#include <linux/config.h> 9#include <linux/config.h>
10#include <linux/limits.h> 10#include <linux/limits.h>
11#include <linux/ioctl.h> 11#include <linux/ioctl.h>
12#include <linux/rcuref.h>
12 13
13/* 14/*
14 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change 15 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -597,12 +598,13 @@ struct file {
597 spinlock_t f_ep_lock; 598 spinlock_t f_ep_lock;
598#endif /* #ifdef CONFIG_EPOLL */ 599#endif /* #ifdef CONFIG_EPOLL */
599 struct address_space *f_mapping; 600 struct address_space *f_mapping;
601 struct rcu_head f_rcuhead;
600}; 602};
601extern spinlock_t files_lock; 603extern spinlock_t files_lock;
602#define file_list_lock() spin_lock(&files_lock); 604#define file_list_lock() spin_lock(&files_lock);
603#define file_list_unlock() spin_unlock(&files_lock); 605#define file_list_unlock() spin_unlock(&files_lock);
604 606
605#define get_file(x) atomic_inc(&(x)->f_count) 607#define get_file(x) rcuref_inc(&(x)->f_count)
606#define file_count(x) atomic_read(&(x)->f_count) 608#define file_count(x) atomic_read(&(x)->f_count)
607 609
608#define MAX_NON_LFS ((1UL<<31) - 1) 610#define MAX_NON_LFS ((1UL<<31) - 1)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 94aefa54a1b5..68ab5f2ab9cd 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -2,6 +2,7 @@
2#define _LINUX__INIT_TASK_H 2#define _LINUX__INIT_TASK_H
3 3
4#include <linux/file.h> 4#include <linux/file.h>
5#include <linux/rcupdate.h>
5 6
6#define INIT_FDTABLE \ 7#define INIT_FDTABLE \
7{ \ 8{ \
@@ -11,12 +12,16 @@
11 .fd = &init_files.fd_array[0], \ 12 .fd = &init_files.fd_array[0], \
12 .close_on_exec = &init_files.close_on_exec_init, \ 13 .close_on_exec = &init_files.close_on_exec_init, \
13 .open_fds = &init_files.open_fds_init, \ 14 .open_fds = &init_files.open_fds_init, \
15 .rcu = RCU_HEAD_INIT, \
16 .free_files = NULL, \
17 .next = NULL, \
14} 18}
15 19
16#define INIT_FILES \ 20#define INIT_FILES \
17{ \ 21{ \
18 .count = ATOMIC_INIT(1), \ 22 .count = ATOMIC_INIT(1), \
19 .file_lock = SPIN_LOCK_UNLOCKED, \ 23 .file_lock = SPIN_LOCK_UNLOCKED, \
24 .fdt = &init_files.fdtab, \
20 .fdtab = INIT_FDTABLE, \ 25 .fdtab = INIT_FDTABLE, \
21 .close_on_exec_init = { { 0, } }, \ 26 .close_on_exec_init = { { 0, } }, \
22 .open_fds_init = { { 0, } }, \ 27 .open_fds_init = { { 0, } }, \
diff --git a/kernel/exit.c b/kernel/exit.c
index 83beb1e93b18..6d2089a1bce7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -411,15 +411,16 @@ void fastcall put_files_struct(struct files_struct *files)
411 close_files(files); 411 close_files(files);
412 /* 412 /*
413 * Free the fd and fdset arrays if we expanded them. 413 * Free the fd and fdset arrays if we expanded them.
414 * If the fdtable was embedded, pass files for freeing
415 * at the end of the RCU grace period. Otherwise,
416 * you can free files immediately.
414 */ 417 */
415 fdt = files_fdtable(files); 418 fdt = files_fdtable(files);
416 if (fdt->fd != &files->fd_array[0]) 419 if (fdt == &files->fdtab)
417 free_fd_array(fdt->fd, fdt->max_fds); 420 fdt->free_files = files;
418 if (fdt->max_fdset > __FD_SETSIZE) { 421 else
419 free_fdset(fdt->open_fds, fdt->max_fdset); 422 kmem_cache_free(files_cachep, files);
420 free_fdset(fdt->close_on_exec, fdt->max_fdset); 423 free_fdtable(fdt);
421 }
422 kmem_cache_free(files_cachep, files);
423 } 424 }
424} 425}
425 426
diff --git a/kernel/fork.c b/kernel/fork.c
index ecc694debb50..8149f3602881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/jiffies.h> 36#include <linux/jiffies.h>
37#include <linux/futex.h> 37#include <linux/futex.h>
38#include <linux/rcupdate.h>
38#include <linux/ptrace.h> 39#include <linux/ptrace.h>
39#include <linux/mount.h> 40#include <linux/mount.h>
40#include <linux/audit.h> 41#include <linux/audit.h>
@@ -565,13 +566,12 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
565 return 0; 566 return 0;
566} 567}
567 568
568static int count_open_files(struct files_struct *files, int size) 569static int count_open_files(struct fdtable *fdt)
569{ 570{
571 int size = fdt->max_fdset;
570 int i; 572 int i;
571 struct fdtable *fdt;
572 573
573 /* Find the last open fd */ 574 /* Find the last open fd */
574 fdt = files_fdtable(files);
575 for (i = size/(8*sizeof(long)); i > 0; ) { 575 for (i = size/(8*sizeof(long)); i > 0; ) {
576 if (fdt->open_fds->fds_bits[--i]) 576 if (fdt->open_fds->fds_bits[--i])
577 break; 577 break;
@@ -592,13 +592,17 @@ static struct files_struct *alloc_files(void)
592 atomic_set(&newf->count, 1); 592 atomic_set(&newf->count, 1);
593 593
594 spin_lock_init(&newf->file_lock); 594 spin_lock_init(&newf->file_lock);
595 fdt = files_fdtable(newf); 595 fdt = &newf->fdtab;
596 fdt->next_fd = 0; 596 fdt->next_fd = 0;
597 fdt->max_fds = NR_OPEN_DEFAULT; 597 fdt->max_fds = NR_OPEN_DEFAULT;
598 fdt->max_fdset = __FD_SETSIZE; 598 fdt->max_fdset = __FD_SETSIZE;
599 fdt->close_on_exec = &newf->close_on_exec_init; 599 fdt->close_on_exec = &newf->close_on_exec_init;
600 fdt->open_fds = &newf->open_fds_init; 600 fdt->open_fds = &newf->open_fds_init;
601 fdt->fd = &newf->fd_array[0]; 601 fdt->fd = &newf->fd_array[0];
602 INIT_RCU_HEAD(&fdt->rcu);
603 fdt->free_files = NULL;
604 fdt->next = NULL;
605 rcu_assign_pointer(newf->fdt, fdt);
602out: 606out:
603 return newf; 607 return newf;
604} 608}
@@ -637,7 +641,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
637 old_fdt = files_fdtable(oldf); 641 old_fdt = files_fdtable(oldf);
638 new_fdt = files_fdtable(newf); 642 new_fdt = files_fdtable(newf);
639 size = old_fdt->max_fdset; 643 size = old_fdt->max_fdset;
640 open_files = count_open_files(oldf, old_fdt->max_fdset); 644 open_files = count_open_files(old_fdt);
641 expand = 0; 645 expand = 0;
642 646
643 /* 647 /*
@@ -661,7 +665,14 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
661 spin_unlock(&newf->file_lock); 665 spin_unlock(&newf->file_lock);
662 if (error < 0) 666 if (error < 0)
663 goto out_release; 667 goto out_release;
668 new_fdt = files_fdtable(newf);
669 /*
670 * Reacquire the oldf lock and a pointer to its fd table
671 * who knows it may have a new bigger fd table. We need
672 * the latest pointer.
673 */
664 spin_lock(&oldf->file_lock); 674 spin_lock(&oldf->file_lock);
675 old_fdt = files_fdtable(oldf);
665 } 676 }
666 677
667 old_fds = old_fdt->fd; 678 old_fds = old_fdt->fd;
@@ -683,7 +694,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
683 */ 694 */
684 FD_CLR(open_files - i, new_fdt->open_fds); 695 FD_CLR(open_files - i, new_fdt->open_fds);
685 } 696 }
686 *new_fds++ = f; 697 rcu_assign_pointer(*new_fds++, f);
687 } 698 }
688 spin_unlock(&oldf->file_lock); 699 spin_unlock(&oldf->file_lock);
689 700