aboutsummaryrefslogtreecommitdiffstats
path: root/fs/file.c
diff options
context:
space:
mode:
authorDipankar Sarma <dipankar@in.ibm.com>2005-09-09 16:04:13 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-09 16:57:55 -0400
commitab2af1f5005069321c5d130f09cce577b03f43ef (patch)
tree73a70ba486f522cd9eeeef376ede2b5a1c1b473b /fs/file.c
parent6e72ad2c581de121cc7e772469e2a8f6b1fd4379 (diff)
[PATCH] files: files struct with RCU
Patch to eliminate the struct files_struct.file_lock spinlock on the reader side and use the RCU refcounting rcuref_xxx API for the f_count refcounter. The updates to the fdtable are done by allocating a new fdtable structure and setting files->fdt to point to the new structure. The fdtable structure is protected by RCU, thereby allowing lock-free lookup. For fd arrays/sets that are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A global list of fdtables to be freed is not scalable, so we use a per-cpu list. If keventd is already handling the current CPU's work, we use a timer to defer queueing of that work. Since the last publication, this patch has been rewritten to avoid using explicit memory barriers and to use the rcu_assign_pointer() and rcu_dereference() primitives instead. This required that the fd information be kept in a separate structure (fdtable) and updated atomically. Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs/file.c')
-rw-r--r--fs/file.c389
1 files changed, 258 insertions, 131 deletions
diff --git a/fs/file.c b/fs/file.c
index f5926ce73f37..2127a7b9dc3a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -13,6 +13,25 @@
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/interrupt.h>
17#include <linux/spinlock.h>
18#include <linux/rcupdate.h>
19#include <linux/workqueue.h>
20
21struct fdtable_defer {
22 spinlock_t lock;
23 struct work_struct wq;
24 struct timer_list timer;
25 struct fdtable *next;
26};
27
28/*
29 * We use this list to defer free fdtables that have vmalloced
30 * sets/arrays. By keeping a per-cpu list, we avoid having to embed
31 * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
32 * this per-task structure.
33 */
34static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
16 35
17 36
18/* 37/*
@@ -48,85 +67,143 @@ void free_fd_array(struct file **array, int num)
48 vfree(array); 67 vfree(array);
49} 68}
50 69
51/* 70static void __free_fdtable(struct fdtable *fdt)
52 * Expand the fd array in the files_struct. Called with the files
53 * spinlock held for write.
54 */
55
56static int expand_fd_array(struct files_struct *files, int nr)
57 __releases(files->file_lock)
58 __acquires(files->file_lock)
59{ 71{
60 struct file **new_fds; 72 int fdset_size, fdarray_size;
61 int error, nfds;
62 struct fdtable *fdt;
63 73
64 74 fdset_size = fdt->max_fdset / 8;
65 error = -EMFILE; 75 fdarray_size = fdt->max_fds * sizeof(struct file *);
66 fdt = files_fdtable(files); 76 free_fdset(fdt->open_fds, fdset_size);
67 if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) 77 free_fdset(fdt->close_on_exec, fdset_size);
68 goto out; 78 free_fd_array(fdt->fd, fdarray_size);
79 kfree(fdt);
80}
69 81
70 nfds = fdt->max_fds; 82static void fdtable_timer(unsigned long data)
71 spin_unlock(&files->file_lock); 83{
84 struct fdtable_defer *fddef = (struct fdtable_defer *)data;
72 85
73 /* 86 spin_lock(&fddef->lock);
74 * Expand to the max in easy steps, and keep expanding it until 87 /*
75 * we have enough for the requested fd array size. 88 * If someone already emptied the queue return.
76 */ 89 */
90 if (!fddef->next)
91 goto out;
92 if (!schedule_work(&fddef->wq))
93 mod_timer(&fddef->timer, 5);
94out:
95 spin_unlock(&fddef->lock);
96}
77 97
78 do { 98static void free_fdtable_work(struct fdtable_defer *f)
79#if NR_OPEN_DEFAULT < 256 99{
80 if (nfds < 256) 100 struct fdtable *fdt;
81 nfds = 256;
82 else
83#endif
84 if (nfds < (PAGE_SIZE / sizeof(struct file *)))
85 nfds = PAGE_SIZE / sizeof(struct file *);
86 else {
87 nfds = nfds * 2;
88 if (nfds > NR_OPEN)
89 nfds = NR_OPEN;
90 }
91 } while (nfds <= nr);
92 101
93 error = -ENOMEM; 102 spin_lock_bh(&f->lock);
94 new_fds = alloc_fd_array(nfds); 103 fdt = f->next;
95 spin_lock(&files->file_lock); 104 f->next = NULL;
96 if (!new_fds) 105 spin_unlock_bh(&f->lock);
97 goto out; 106 while(fdt) {
107 struct fdtable *next = fdt->next;
108 __free_fdtable(fdt);
109 fdt = next;
110 }
111}
98 112
99 /* Copy the existing array and install the new pointer */ 113static void free_fdtable_rcu(struct rcu_head *rcu)
100 fdt = files_fdtable(files); 114{
115 struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
116 int fdset_size, fdarray_size;
117 struct fdtable_defer *fddef;
101 118
102 if (nfds > fdt->max_fds) { 119 BUG_ON(!fdt);
103 struct file **old_fds; 120 fdset_size = fdt->max_fdset / 8;
104 int i; 121 fdarray_size = fdt->max_fds * sizeof(struct file *);
105 122
106 old_fds = xchg(&fdt->fd, new_fds); 123 if (fdt->free_files) {
107 i = xchg(&fdt->max_fds, nfds); 124 /*
108 125 * This fdtable was embedded in the files structure
109 /* Don't copy/clear the array if we are creating a new 126 * and the files structure itself was getting destroyed.
110 fd array for fork() */ 127 * It is now safe to free the files structure.
111 if (i) { 128 */
112 memcpy(new_fds, old_fds, i * sizeof(struct file *)); 129 kmem_cache_free(files_cachep, fdt->free_files);
113 /* clear the remainder of the array */ 130 return;
114 memset(&new_fds[i], 0, 131 }
115 (nfds-i) * sizeof(struct file *)); 132 if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
116 133 /*
117 spin_unlock(&files->file_lock); 134 * The fdtable was embedded
118 free_fd_array(old_fds, i); 135 */
119 spin_lock(&files->file_lock); 136 return;
120 } 137 }
138 if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
139 kfree(fdt->open_fds);
140 kfree(fdt->close_on_exec);
141 kfree(fdt->fd);
142 kfree(fdt);
121 } else { 143 } else {
122 /* Somebody expanded the array while we slept ... */ 144 fddef = &get_cpu_var(fdtable_defer_list);
123 spin_unlock(&files->file_lock); 145 spin_lock(&fddef->lock);
124 free_fd_array(new_fds, nfds); 146 fdt->next = fddef->next;
125 spin_lock(&files->file_lock); 147 fddef->next = fdt;
148 /*
149 * vmallocs are handled from the workqueue context.
150 * If the per-cpu workqueue is running, then we
151 * defer work scheduling through a timer.
152 */
153 if (!schedule_work(&fddef->wq))
154 mod_timer(&fddef->timer, 5);
155 spin_unlock(&fddef->lock);
156 put_cpu_var(fdtable_defer_list);
126 } 157 }
127 error = 0; 158}
128out: 159
129 return error; 160void free_fdtable(struct fdtable *fdt)
161{
162 if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
163 fdt->max_fds > NR_OPEN_DEFAULT)
164 call_rcu(&fdt->rcu, free_fdtable_rcu);
165}
166
167/*
168 * Expand the fdset in the files_struct. Called with the files spinlock
169 * held for write.
170 */
171static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
172{
173 int i;
174 int count;
175
176 BUG_ON(nfdt->max_fdset < fdt->max_fdset);
177 BUG_ON(nfdt->max_fds < fdt->max_fds);
178 /* Copy the existing tables and install the new pointers */
179
180 i = fdt->max_fdset / (sizeof(unsigned long) * 8);
181 count = (nfdt->max_fdset - fdt->max_fdset) / 8;
182
183 /*
184 * Don't copy the entire array if the current fdset is
185 * not yet initialised.
186 */
187 if (i) {
188 memcpy (nfdt->open_fds, fdt->open_fds,
189 fdt->max_fdset/8);
190 memcpy (nfdt->close_on_exec, fdt->close_on_exec,
191 fdt->max_fdset/8);
192 memset (&nfdt->open_fds->fds_bits[i], 0, count);
193 memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
194 }
195
196 /* Don't copy/clear the array if we are creating a new
197 fd array for fork() */
198 if (fdt->max_fds) {
199 memcpy(nfdt->fd, fdt->fd,
200 fdt->max_fds * sizeof(struct file *));
201 /* clear the remainder of the array */
202 memset(&nfdt->fd[fdt->max_fds], 0,
203 (nfdt->max_fds - fdt->max_fds) *
204 sizeof(struct file *));
205 }
206 nfdt->next_fd = fdt->next_fd;
130} 207}
131 208
132/* 209/*
@@ -157,28 +234,21 @@ void free_fdset(fd_set *array, int num)
157 vfree(array); 234 vfree(array);
158} 235}
159 236
160/* 237static struct fdtable *alloc_fdtable(int nr)
161 * Expand the fdset in the files_struct. Called with the files spinlock
162 * held for write.
163 */
164static int expand_fdset(struct files_struct *files, int nr)
165 __releases(file->file_lock)
166 __acquires(file->file_lock)
167{ 238{
168 fd_set *new_openset = NULL, *new_execset = NULL; 239 struct fdtable *fdt = NULL;
169 int error, nfds = 0; 240 int nfds = 0;
170 struct fdtable *fdt; 241 fd_set *new_openset = NULL, *new_execset = NULL;
171 242 struct file **new_fds;
172 error = -EMFILE;
173 fdt = files_fdtable(files);
174 if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN)
175 goto out;
176 243
177 nfds = fdt->max_fdset; 244 fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
178 spin_unlock(&files->file_lock); 245 if (!fdt)
246 goto out;
247 memset(fdt, 0, sizeof(*fdt));
179 248
180 /* Expand to the max in easy steps */ 249 nfds = __FD_SETSIZE;
181 do { 250 /* Expand to the max in easy steps */
251 do {
182 if (nfds < (PAGE_SIZE * 8)) 252 if (nfds < (PAGE_SIZE * 8))
183 nfds = PAGE_SIZE * 8; 253 nfds = PAGE_SIZE * 8;
184 else { 254 else {
@@ -188,50 +258,88 @@ static int expand_fdset(struct files_struct *files, int nr)
188 } 258 }
189 } while (nfds <= nr); 259 } while (nfds <= nr);
190 260
191 error = -ENOMEM; 261 new_openset = alloc_fdset(nfds);
192 new_openset = alloc_fdset(nfds); 262 new_execset = alloc_fdset(nfds);
193 new_execset = alloc_fdset(nfds); 263 if (!new_openset || !new_execset)
194 spin_lock(&files->file_lock); 264 goto out;
195 if (!new_openset || !new_execset) 265 fdt->open_fds = new_openset;
266 fdt->close_on_exec = new_execset;
267 fdt->max_fdset = nfds;
268
269 nfds = NR_OPEN_DEFAULT;
270 /*
271 * Expand to the max in easy steps, and keep expanding it until
272 * we have enough for the requested fd array size.
273 */
274 do {
275#if NR_OPEN_DEFAULT < 256
276 if (nfds < 256)
277 nfds = 256;
278 else
279#endif
280 if (nfds < (PAGE_SIZE / sizeof(struct file *)))
281 nfds = PAGE_SIZE / sizeof(struct file *);
282 else {
283 nfds = nfds * 2;
284 if (nfds > NR_OPEN)
285 nfds = NR_OPEN;
286 }
287 } while (nfds <= nr);
288 new_fds = alloc_fd_array(nfds);
289 if (!new_fds)
196 goto out; 290 goto out;
291 fdt->fd = new_fds;
292 fdt->max_fds = nfds;
293 fdt->free_files = NULL;
294 return fdt;
295out:
296 if (new_openset)
297 free_fdset(new_openset, nfds);
298 if (new_execset)
299 free_fdset(new_execset, nfds);
300 kfree(fdt);
301 return NULL;
302}
197 303
198 error = 0; 304/*
199 305 * Expands the file descriptor table - it will allocate a new fdtable and
200 /* Copy the existing tables and install the new pointers */ 306 * both fd array and fdset. It is expected to be called with the
307 * files_lock held.
308 */
309static int expand_fdtable(struct files_struct *files, int nr)
310 __releases(files->file_lock)
311 __acquires(files->file_lock)
312{
313 int error = 0;
314 struct fdtable *fdt;
315 struct fdtable *nfdt = NULL;
316
317 spin_unlock(&files->file_lock);
318 nfdt = alloc_fdtable(nr);
319 if (!nfdt) {
320 error = -ENOMEM;
321 spin_lock(&files->file_lock);
322 goto out;
323 }
324
325 spin_lock(&files->file_lock);
201 fdt = files_fdtable(files); 326 fdt = files_fdtable(files);
202 if (nfds > fdt->max_fdset) { 327 /*
203 int i = fdt->max_fdset / (sizeof(unsigned long) * 8); 328 * Check again since another task may have expanded the
204 int count = (nfds - fdt->max_fdset) / 8; 329 * fd table while we dropped the lock
205 330 */
206 /* 331 if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
207 * Don't copy the entire array if the current fdset is 332 copy_fdtable(nfdt, fdt);
208 * not yet initialised. 333 } else {
209 */ 334 /* Somebody expanded while we dropped file_lock */
210 if (i) {
211 memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8);
212 memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8);
213 memset (&new_openset->fds_bits[i], 0, count);
214 memset (&new_execset->fds_bits[i], 0, count);
215 }
216
217 nfds = xchg(&fdt->max_fdset, nfds);
218 new_openset = xchg(&fdt->open_fds, new_openset);
219 new_execset = xchg(&fdt->close_on_exec, new_execset);
220 spin_unlock(&files->file_lock); 335 spin_unlock(&files->file_lock);
221 free_fdset (new_openset, nfds); 336 __free_fdtable(nfdt);
222 free_fdset (new_execset, nfds);
223 spin_lock(&files->file_lock); 337 spin_lock(&files->file_lock);
224 return 0; 338 goto out;
225 } 339 }
226 /* Somebody expanded the array while we slept ... */ 340 rcu_assign_pointer(files->fdt, nfdt);
227 341 free_fdtable(fdt);
228out: 342out:
229 spin_unlock(&files->file_lock);
230 if (new_openset)
231 free_fdset(new_openset, nfds);
232 if (new_execset)
233 free_fdset(new_execset, nfds);
234 spin_lock(&files->file_lock);
235 return error; 343 return error;
236} 344}
237 345
@@ -246,17 +354,36 @@ int expand_files(struct files_struct *files, int nr)
246 struct fdtable *fdt; 354 struct fdtable *fdt;
247 355
248 fdt = files_fdtable(files); 356 fdt = files_fdtable(files);
249 if (nr >= fdt->max_fdset) { 357 if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
250 expand = 1; 358 if (fdt->max_fdset >= NR_OPEN ||
251 if ((err = expand_fdset(files, nr))) 359 fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
360 err = -EMFILE;
252 goto out; 361 goto out;
253 } 362 }
254 if (nr >= fdt->max_fds) {
255 expand = 1; 363 expand = 1;
256 if ((err = expand_fd_array(files, nr))) 364 if ((err = expand_fdtable(files, nr)))
257 goto out; 365 goto out;
258 } 366 }
259 err = expand; 367 err = expand;
260out: 368out:
261 return err; 369 return err;
262} 370}
371
372static void __devinit fdtable_defer_list_init(int cpu)
373{
374 struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
375 spin_lock_init(&fddef->lock);
376 INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
377 init_timer(&fddef->timer);
378 fddef->timer.data = (unsigned long)fddef;
379 fddef->timer.function = fdtable_timer;
380 fddef->next = NULL;
381}
382
383void __init files_defer_init(void)
384{
385 int i;
386 /* Really early - can't use for_each_cpu */
387 for (i = 0; i < NR_CPUS; i++)
388 fdtable_defer_list_init(i);
389}