diff options
Diffstat (limited to 'fs/nfs/direct.c')
-rw-r--r-- | fs/nfs/direct.c | 808 |
1 files changed, 808 insertions, 0 deletions
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c new file mode 100644 index 000000000000..68df803f27ca --- /dev/null +++ b/fs/nfs/direct.c | |||
@@ -0,0 +1,808 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/direct.c | ||
3 | * | ||
4 | * Copyright (C) 2003 by Chuck Lever <cel@netapp.com> | ||
5 | * | ||
6 | * High-performance uncached I/O for the Linux NFS client | ||
7 | * | ||
8 | * There are important applications whose performance or correctness | ||
9 | * depends on uncached access to file data. Database clusters | ||
10 | * (multiple copies of the same instance running on separate hosts) | ||
11 | * implement their own cache coherency protocol that subsumes file | ||
12 | * system cache protocols. Applications that process datasets | ||
13 | * considerably larger than the client's memory do not always benefit | ||
14 | * from a local cache. A streaming video server, for instance, has no | ||
15 | * need to cache the contents of a file. | ||
16 | * | ||
17 | * When an application requests uncached I/O, all read and write requests | ||
18 | * are made directly to the server; data stored or fetched via these | ||
19 | * requests is not cached in the Linux page cache. The client does not | ||
20 | * correct unaligned requests from applications. All requested bytes are | ||
21 | * held on permanent storage before a direct write system call returns to | ||
22 | * an application. | ||
23 | * | ||
24 | * Solaris implements an uncached I/O facility called directio() that | ||
25 | * is used for backups and sequential I/O to very large files. Solaris | ||
26 | * also supports uncaching whole NFS partitions with "-o forcedirectio," | ||
27 | * an undocumented mount option. | ||
28 | * | ||
29 | * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with | ||
30 | * help from Andrew Morton. | ||
31 | * | ||
32 | * 18 Dec 2001 Initial implementation for 2.4 --cel | ||
33 | * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy | ||
34 | * 08 Jun 2003 Port to 2.5 APIs --cel | ||
35 | * 31 Mar 2004 Handle direct I/O without VFS support --cel | ||
36 | * 15 Sep 2004 Parallel async reads --cel | ||
37 | * | ||
38 | */ | ||
39 | |||
40 | #include <linux/config.h> | ||
41 | #include <linux/errno.h> | ||
42 | #include <linux/sched.h> | ||
43 | #include <linux/kernel.h> | ||
44 | #include <linux/smp_lock.h> | ||
45 | #include <linux/file.h> | ||
46 | #include <linux/pagemap.h> | ||
47 | #include <linux/kref.h> | ||
48 | |||
49 | #include <linux/nfs_fs.h> | ||
50 | #include <linux/nfs_page.h> | ||
51 | #include <linux/sunrpc/clnt.h> | ||
52 | |||
53 | #include <asm/system.h> | ||
54 | #include <asm/uaccess.h> | ||
55 | #include <asm/atomic.h> | ||
56 | |||
57 | #define NFSDBG_FACILITY NFSDBG_VFS | ||
58 | #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) | ||
59 | |||
60 | static kmem_cache_t *nfs_direct_cachep; | ||
61 | |||
62 | /* | ||
63 | * This represents a set of asynchronous requests that we're waiting on | ||
64 | */ | ||
65 | struct nfs_direct_req { | ||
66 | struct kref kref; /* release manager */ | ||
67 | struct list_head list; /* nfs_read_data structs */ | ||
68 | wait_queue_head_t wait; /* wait for i/o completion */ | ||
69 | struct page ** pages; /* pages in our buffer */ | ||
70 | unsigned int npages; /* count of pages */ | ||
71 | atomic_t complete, /* i/os we're waiting for */ | ||
72 | count, /* bytes actually processed */ | ||
73 | error; /* any reported error */ | ||
74 | }; | ||
75 | |||
76 | |||
77 | /** | ||
78 | * nfs_get_user_pages - find and set up pages underlying user's buffer | ||
79 | * rw: direction (read or write) | ||
80 | * user_addr: starting address of this segment of user's buffer | ||
81 | * count: size of this segment | ||
82 | * @pages: returned array of page struct pointers underlying user's buffer | ||
83 | */ | ||
84 | static inline int | ||
85 | nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, | ||
86 | struct page ***pages) | ||
87 | { | ||
88 | int result = -ENOMEM; | ||
89 | unsigned long page_count; | ||
90 | size_t array_size; | ||
91 | |||
92 | /* set an arbitrary limit to prevent type overflow */ | ||
93 | /* XXX: this can probably be as large as INT_MAX */ | ||
94 | if (size > MAX_DIRECTIO_SIZE) { | ||
95 | *pages = NULL; | ||
96 | return -EFBIG; | ||
97 | } | ||
98 | |||
99 | page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
100 | page_count -= user_addr >> PAGE_SHIFT; | ||
101 | |||
102 | array_size = (page_count * sizeof(struct page *)); | ||
103 | *pages = kmalloc(array_size, GFP_KERNEL); | ||
104 | if (*pages) { | ||
105 | down_read(¤t->mm->mmap_sem); | ||
106 | result = get_user_pages(current, current->mm, user_addr, | ||
107 | page_count, (rw == READ), 0, | ||
108 | *pages, NULL); | ||
109 | up_read(¤t->mm->mmap_sem); | ||
110 | } | ||
111 | return result; | ||
112 | } | ||
113 | |||
/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @npages: number of pages in the array
 * @do_dirty: dirty the pages as we release them
 *
 * Drops one reference on each of the first @npages entries, optionally
 * marking each page dirty first, and then frees the array itself.
 */
static void
nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *page = pages[idx++];

		if (do_dirty)
			set_page_dirty_lock(page);
		page_cache_release(page);
	}

	kfree(pages);
}
131 | |||
132 | /** | ||
133 | * nfs_direct_req_release - release nfs_direct_req structure for direct read | ||
134 | * @kref: kref object embedded in an nfs_direct_req structure | ||
135 | * | ||
136 | */ | ||
137 | static void nfs_direct_req_release(struct kref *kref) | ||
138 | { | ||
139 | struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); | ||
140 | kmem_cache_free(nfs_direct_cachep, dreq); | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read | ||
145 | * @count: count of bytes for the read request | ||
146 | * @rsize: local rsize setting | ||
147 | * | ||
148 | * Note we also set the number of requests we have in the dreq when we are | ||
149 | * done. This prevents races with I/O completion so we will always wait | ||
150 | * until all requests have been dispatched and completed. | ||
151 | */ | ||
152 | static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize) | ||
153 | { | ||
154 | struct list_head *list; | ||
155 | struct nfs_direct_req *dreq; | ||
156 | unsigned int reads = 0; | ||
157 | |||
158 | dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); | ||
159 | if (!dreq) | ||
160 | return NULL; | ||
161 | |||
162 | kref_init(&dreq->kref); | ||
163 | init_waitqueue_head(&dreq->wait); | ||
164 | INIT_LIST_HEAD(&dreq->list); | ||
165 | atomic_set(&dreq->count, 0); | ||
166 | atomic_set(&dreq->error, 0); | ||
167 | |||
168 | list = &dreq->list; | ||
169 | for(;;) { | ||
170 | struct nfs_read_data *data = nfs_readdata_alloc(); | ||
171 | |||
172 | if (unlikely(!data)) { | ||
173 | while (!list_empty(list)) { | ||
174 | data = list_entry(list->next, | ||
175 | struct nfs_read_data, pages); | ||
176 | list_del(&data->pages); | ||
177 | nfs_readdata_free(data); | ||
178 | } | ||
179 | kref_put(&dreq->kref, nfs_direct_req_release); | ||
180 | return NULL; | ||
181 | } | ||
182 | |||
183 | INIT_LIST_HEAD(&data->pages); | ||
184 | list_add(&data->pages, list); | ||
185 | |||
186 | data->req = (struct nfs_page *) dreq; | ||
187 | reads++; | ||
188 | if (nbytes <= rsize) | ||
189 | break; | ||
190 | nbytes -= rsize; | ||
191 | } | ||
192 | kref_get(&dreq->kref); | ||
193 | atomic_set(&dreq->complete, reads); | ||
194 | return dreq; | ||
195 | } | ||
196 | |||
197 | /** | ||
198 | * nfs_direct_read_result - handle a read reply for a direct read request | ||
199 | * @data: address of NFS READ operation control block | ||
200 | * @status: status of this NFS READ operation | ||
201 | * | ||
202 | * We must hold a reference to all the pages in this direct read request | ||
203 | * until the RPCs complete. This could be long *after* we are woken up in | ||
204 | * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server). | ||
205 | */ | ||
206 | static void nfs_direct_read_result(struct nfs_read_data *data, int status) | ||
207 | { | ||
208 | struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; | ||
209 | |||
210 | if (likely(status >= 0)) | ||
211 | atomic_add(data->res.count, &dreq->count); | ||
212 | else | ||
213 | atomic_set(&dreq->error, status); | ||
214 | |||
215 | if (unlikely(atomic_dec_and_test(&dreq->complete))) { | ||
216 | nfs_free_user_pages(dreq->pages, dreq->npages, 1); | ||
217 | wake_up(&dreq->wait); | ||
218 | kref_put(&dreq->kref, nfs_direct_req_release); | ||
219 | } | ||
220 | } | ||
221 | |||
222 | /** | ||
223 | * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read | ||
224 | * @dreq: address of nfs_direct_req struct for this request | ||
225 | * @inode: target inode | ||
226 | * @ctx: target file open context | ||
227 | * @user_addr: starting address of this segment of user's buffer | ||
228 | * @count: size of this segment | ||
229 | * @file_offset: offset in file to begin the operation | ||
230 | * | ||
231 | * For each nfs_read_data struct that was allocated on the list, dispatch | ||
232 | * an NFS READ operation | ||
233 | */ | ||
234 | static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, | ||
235 | struct inode *inode, struct nfs_open_context *ctx, | ||
236 | unsigned long user_addr, size_t count, loff_t file_offset) | ||
237 | { | ||
238 | struct list_head *list = &dreq->list; | ||
239 | struct page **pages = dreq->pages; | ||
240 | unsigned int curpage, pgbase; | ||
241 | unsigned int rsize = NFS_SERVER(inode)->rsize; | ||
242 | |||
243 | curpage = 0; | ||
244 | pgbase = user_addr & ~PAGE_MASK; | ||
245 | do { | ||
246 | struct nfs_read_data *data; | ||
247 | unsigned int bytes; | ||
248 | |||
249 | bytes = rsize; | ||
250 | if (count < rsize) | ||
251 | bytes = count; | ||
252 | |||
253 | data = list_entry(list->next, struct nfs_read_data, pages); | ||
254 | list_del_init(&data->pages); | ||
255 | |||
256 | data->inode = inode; | ||
257 | data->cred = ctx->cred; | ||
258 | data->args.fh = NFS_FH(inode); | ||
259 | data->args.context = ctx; | ||
260 | data->args.offset = file_offset; | ||
261 | data->args.pgbase = pgbase; | ||
262 | data->args.pages = &pages[curpage]; | ||
263 | data->args.count = bytes; | ||
264 | data->res.fattr = &data->fattr; | ||
265 | data->res.eof = 0; | ||
266 | data->res.count = bytes; | ||
267 | |||
268 | NFS_PROTO(inode)->read_setup(data); | ||
269 | |||
270 | data->task.tk_cookie = (unsigned long) inode; | ||
271 | data->task.tk_calldata = data; | ||
272 | data->task.tk_release = nfs_readdata_release; | ||
273 | data->complete = nfs_direct_read_result; | ||
274 | |||
275 | lock_kernel(); | ||
276 | rpc_execute(&data->task); | ||
277 | unlock_kernel(); | ||
278 | |||
279 | dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n", | ||
280 | data->task.tk_pid, | ||
281 | inode->i_sb->s_id, | ||
282 | (long long)NFS_FILEID(inode), | ||
283 | bytes, | ||
284 | (unsigned long long)data->args.offset); | ||
285 | |||
286 | file_offset += bytes; | ||
287 | pgbase += bytes; | ||
288 | curpage += pgbase >> PAGE_SHIFT; | ||
289 | pgbase &= ~PAGE_MASK; | ||
290 | |||
291 | count -= bytes; | ||
292 | } while (count != 0); | ||
293 | } | ||
294 | |||
295 | /** | ||
296 | * nfs_direct_read_wait - wait for I/O completion for direct reads | ||
297 | * @dreq: request on which we are to wait | ||
298 | * @intr: whether or not this wait can be interrupted | ||
299 | * | ||
300 | * Collects and returns the final error value/byte-count. | ||
301 | */ | ||
302 | static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr) | ||
303 | { | ||
304 | int result = 0; | ||
305 | |||
306 | if (intr) { | ||
307 | result = wait_event_interruptible(dreq->wait, | ||
308 | (atomic_read(&dreq->complete) == 0)); | ||
309 | } else { | ||
310 | wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0)); | ||
311 | } | ||
312 | |||
313 | if (!result) | ||
314 | result = atomic_read(&dreq->error); | ||
315 | if (!result) | ||
316 | result = atomic_read(&dreq->count); | ||
317 | |||
318 | kref_put(&dreq->kref, nfs_direct_req_release); | ||
319 | return (ssize_t) result; | ||
320 | } | ||
321 | |||
322 | /** | ||
323 | * nfs_direct_read_seg - Read in one iov segment. Generate separate | ||
324 | * read RPCs for each "rsize" bytes. | ||
325 | * @inode: target inode | ||
326 | * @ctx: target file open context | ||
327 | * @user_addr: starting address of this segment of user's buffer | ||
328 | * @count: size of this segment | ||
329 | * @file_offset: offset in file to begin the operation | ||
330 | * @pages: array of addresses of page structs defining user's buffer | ||
331 | * @nr_pages: number of pages in the array | ||
332 | * | ||
333 | */ | ||
334 | static ssize_t nfs_direct_read_seg(struct inode *inode, | ||
335 | struct nfs_open_context *ctx, unsigned long user_addr, | ||
336 | size_t count, loff_t file_offset, struct page **pages, | ||
337 | unsigned int nr_pages) | ||
338 | { | ||
339 | ssize_t result; | ||
340 | sigset_t oldset; | ||
341 | struct rpc_clnt *clnt = NFS_CLIENT(inode); | ||
342 | struct nfs_direct_req *dreq; | ||
343 | |||
344 | dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize); | ||
345 | if (!dreq) | ||
346 | return -ENOMEM; | ||
347 | |||
348 | dreq->pages = pages; | ||
349 | dreq->npages = nr_pages; | ||
350 | |||
351 | rpc_clnt_sigmask(clnt, &oldset); | ||
352 | nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count, | ||
353 | file_offset); | ||
354 | result = nfs_direct_read_wait(dreq, clnt->cl_intr); | ||
355 | rpc_clnt_sigunmask(clnt, &oldset); | ||
356 | |||
357 | return result; | ||
358 | } | ||
359 | |||
360 | /** | ||
361 | * nfs_direct_read - For each iov segment, map the user's buffer | ||
362 | * then generate read RPCs. | ||
363 | * @inode: target inode | ||
364 | * @ctx: target file open context | ||
365 | * @iov: array of vectors that define I/O buffer | ||
366 | * file_offset: offset in file to begin the operation | ||
367 | * nr_segs: size of iovec array | ||
368 | * | ||
369 | * We've already pushed out any non-direct writes so that this read | ||
370 | * will see them when we read from the server. | ||
371 | */ | ||
372 | static ssize_t | ||
373 | nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx, | ||
374 | const struct iovec *iov, loff_t file_offset, | ||
375 | unsigned long nr_segs) | ||
376 | { | ||
377 | ssize_t tot_bytes = 0; | ||
378 | unsigned long seg = 0; | ||
379 | |||
380 | while ((seg < nr_segs) && (tot_bytes >= 0)) { | ||
381 | ssize_t result; | ||
382 | int page_count; | ||
383 | struct page **pages; | ||
384 | const struct iovec *vec = &iov[seg++]; | ||
385 | unsigned long user_addr = (unsigned long) vec->iov_base; | ||
386 | size_t size = vec->iov_len; | ||
387 | |||
388 | page_count = nfs_get_user_pages(READ, user_addr, size, &pages); | ||
389 | if (page_count < 0) { | ||
390 | nfs_free_user_pages(pages, 0, 0); | ||
391 | if (tot_bytes > 0) | ||
392 | break; | ||
393 | return page_count; | ||
394 | } | ||
395 | |||
396 | result = nfs_direct_read_seg(inode, ctx, user_addr, size, | ||
397 | file_offset, pages, page_count); | ||
398 | |||
399 | if (result <= 0) { | ||
400 | if (tot_bytes > 0) | ||
401 | break; | ||
402 | return result; | ||
403 | } | ||
404 | tot_bytes += result; | ||
405 | file_offset += result; | ||
406 | if (result < size) | ||
407 | break; | ||
408 | } | ||
409 | |||
410 | return tot_bytes; | ||
411 | } | ||
412 | |||
413 | /** | ||
414 | * nfs_direct_write_seg - Write out one iov segment. Generate separate | ||
415 | * write RPCs for each "wsize" bytes, then commit. | ||
416 | * @inode: target inode | ||
417 | * @ctx: target file open context | ||
418 | * user_addr: starting address of this segment of user's buffer | ||
419 | * count: size of this segment | ||
420 | * file_offset: offset in file to begin the operation | ||
421 | * @pages: array of addresses of page structs defining user's buffer | ||
422 | * nr_pages: size of pages array | ||
423 | */ | ||
424 | static ssize_t nfs_direct_write_seg(struct inode *inode, | ||
425 | struct nfs_open_context *ctx, unsigned long user_addr, | ||
426 | size_t count, loff_t file_offset, struct page **pages, | ||
427 | int nr_pages) | ||
428 | { | ||
429 | const unsigned int wsize = NFS_SERVER(inode)->wsize; | ||
430 | size_t request; | ||
431 | int curpage, need_commit; | ||
432 | ssize_t result, tot_bytes; | ||
433 | struct nfs_writeverf first_verf; | ||
434 | struct nfs_write_data *wdata; | ||
435 | |||
436 | wdata = nfs_writedata_alloc(); | ||
437 | if (!wdata) | ||
438 | return -ENOMEM; | ||
439 | |||
440 | wdata->inode = inode; | ||
441 | wdata->cred = ctx->cred; | ||
442 | wdata->args.fh = NFS_FH(inode); | ||
443 | wdata->args.context = ctx; | ||
444 | wdata->args.stable = NFS_UNSTABLE; | ||
445 | if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize) | ||
446 | wdata->args.stable = NFS_FILE_SYNC; | ||
447 | wdata->res.fattr = &wdata->fattr; | ||
448 | wdata->res.verf = &wdata->verf; | ||
449 | |||
450 | nfs_begin_data_update(inode); | ||
451 | retry: | ||
452 | need_commit = 0; | ||
453 | tot_bytes = 0; | ||
454 | curpage = 0; | ||
455 | request = count; | ||
456 | wdata->args.pgbase = user_addr & ~PAGE_MASK; | ||
457 | wdata->args.offset = file_offset; | ||
458 | do { | ||
459 | wdata->args.count = request; | ||
460 | if (wdata->args.count > wsize) | ||
461 | wdata->args.count = wsize; | ||
462 | wdata->args.pages = &pages[curpage]; | ||
463 | |||
464 | dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", | ||
465 | wdata->args.count, (long long) wdata->args.offset, | ||
466 | user_addr + tot_bytes, wdata->args.pgbase, curpage); | ||
467 | |||
468 | lock_kernel(); | ||
469 | result = NFS_PROTO(inode)->write(wdata); | ||
470 | unlock_kernel(); | ||
471 | |||
472 | if (result <= 0) { | ||
473 | if (tot_bytes > 0) | ||
474 | break; | ||
475 | goto out; | ||
476 | } | ||
477 | |||
478 | if (tot_bytes == 0) | ||
479 | memcpy(&first_verf.verifier, &wdata->verf.verifier, | ||
480 | sizeof(first_verf.verifier)); | ||
481 | if (wdata->verf.committed != NFS_FILE_SYNC) { | ||
482 | need_commit = 1; | ||
483 | if (memcmp(&first_verf.verifier, &wdata->verf.verifier, | ||
484 | sizeof(first_verf.verifier))); | ||
485 | goto sync_retry; | ||
486 | } | ||
487 | |||
488 | tot_bytes += result; | ||
489 | |||
490 | /* in case of a short write: stop now, let the app recover */ | ||
491 | if (result < wdata->args.count) | ||
492 | break; | ||
493 | |||
494 | wdata->args.offset += result; | ||
495 | wdata->args.pgbase += result; | ||
496 | curpage += wdata->args.pgbase >> PAGE_SHIFT; | ||
497 | wdata->args.pgbase &= ~PAGE_MASK; | ||
498 | request -= result; | ||
499 | } while (request != 0); | ||
500 | |||
501 | /* | ||
502 | * Commit data written so far, even in the event of an error | ||
503 | */ | ||
504 | if (need_commit) { | ||
505 | wdata->args.count = tot_bytes; | ||
506 | wdata->args.offset = file_offset; | ||
507 | |||
508 | lock_kernel(); | ||
509 | result = NFS_PROTO(inode)->commit(wdata); | ||
510 | unlock_kernel(); | ||
511 | |||
512 | if (result < 0 || memcmp(&first_verf.verifier, | ||
513 | &wdata->verf.verifier, | ||
514 | sizeof(first_verf.verifier)) != 0) | ||
515 | goto sync_retry; | ||
516 | } | ||
517 | result = tot_bytes; | ||
518 | |||
519 | out: | ||
520 | nfs_end_data_update_defer(inode); | ||
521 | nfs_writedata_free(wdata); | ||
522 | return result; | ||
523 | |||
524 | sync_retry: | ||
525 | wdata->args.stable = NFS_FILE_SYNC; | ||
526 | goto retry; | ||
527 | } | ||
528 | |||
529 | /** | ||
530 | * nfs_direct_write - For each iov segment, map the user's buffer | ||
531 | * then generate write and commit RPCs. | ||
532 | * @inode: target inode | ||
533 | * @ctx: target file open context | ||
534 | * @iov: array of vectors that define I/O buffer | ||
535 | * file_offset: offset in file to begin the operation | ||
536 | * nr_segs: size of iovec array | ||
537 | * | ||
538 | * Upon return, generic_file_direct_IO invalidates any cached pages | ||
539 | * that non-direct readers might access, so they will pick up these | ||
540 | * writes immediately. | ||
541 | */ | ||
542 | static ssize_t nfs_direct_write(struct inode *inode, | ||
543 | struct nfs_open_context *ctx, const struct iovec *iov, | ||
544 | loff_t file_offset, unsigned long nr_segs) | ||
545 | { | ||
546 | ssize_t tot_bytes = 0; | ||
547 | unsigned long seg = 0; | ||
548 | |||
549 | while ((seg < nr_segs) && (tot_bytes >= 0)) { | ||
550 | ssize_t result; | ||
551 | int page_count; | ||
552 | struct page **pages; | ||
553 | const struct iovec *vec = &iov[seg++]; | ||
554 | unsigned long user_addr = (unsigned long) vec->iov_base; | ||
555 | size_t size = vec->iov_len; | ||
556 | |||
557 | page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages); | ||
558 | if (page_count < 0) { | ||
559 | nfs_free_user_pages(pages, 0, 0); | ||
560 | if (tot_bytes > 0) | ||
561 | break; | ||
562 | return page_count; | ||
563 | } | ||
564 | |||
565 | result = nfs_direct_write_seg(inode, ctx, user_addr, size, | ||
566 | file_offset, pages, page_count); | ||
567 | nfs_free_user_pages(pages, page_count, 0); | ||
568 | |||
569 | if (result <= 0) { | ||
570 | if (tot_bytes > 0) | ||
571 | break; | ||
572 | return result; | ||
573 | } | ||
574 | tot_bytes += result; | ||
575 | file_offset += result; | ||
576 | if (result < size) | ||
577 | break; | ||
578 | } | ||
579 | return tot_bytes; | ||
580 | } | ||
581 | |||
582 | /** | ||
583 | * nfs_direct_IO - NFS address space operation for direct I/O | ||
584 | * rw: direction (read or write) | ||
585 | * @iocb: target I/O control block | ||
586 | * @iov: array of vectors that define I/O buffer | ||
587 | * file_offset: offset in file to begin the operation | ||
588 | * nr_segs: size of iovec array | ||
589 | * | ||
590 | */ | ||
591 | ssize_t | ||
592 | nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
593 | loff_t file_offset, unsigned long nr_segs) | ||
594 | { | ||
595 | ssize_t result = -EINVAL; | ||
596 | struct file *file = iocb->ki_filp; | ||
597 | struct nfs_open_context *ctx; | ||
598 | struct dentry *dentry = file->f_dentry; | ||
599 | struct inode *inode = dentry->d_inode; | ||
600 | |||
601 | /* | ||
602 | * No support for async yet | ||
603 | */ | ||
604 | if (!is_sync_kiocb(iocb)) | ||
605 | return result; | ||
606 | |||
607 | ctx = (struct nfs_open_context *)file->private_data; | ||
608 | switch (rw) { | ||
609 | case READ: | ||
610 | dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n", | ||
611 | dentry->d_name.name, file_offset, nr_segs); | ||
612 | |||
613 | result = nfs_direct_read(inode, ctx, iov, | ||
614 | file_offset, nr_segs); | ||
615 | break; | ||
616 | case WRITE: | ||
617 | dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n", | ||
618 | dentry->d_name.name, file_offset, nr_segs); | ||
619 | |||
620 | result = nfs_direct_write(inode, ctx, iov, | ||
621 | file_offset, nr_segs); | ||
622 | break; | ||
623 | default: | ||
624 | break; | ||
625 | } | ||
626 | return result; | ||
627 | } | ||
628 | |||
629 | /** | ||
630 | * nfs_file_direct_read - file direct read operation for NFS files | ||
631 | * @iocb: target I/O control block | ||
632 | * @buf: user's buffer into which to read data | ||
633 | * count: number of bytes to read | ||
634 | * pos: byte offset in file where reading starts | ||
635 | * | ||
636 | * We use this function for direct reads instead of calling | ||
637 | * generic_file_aio_read() in order to avoid gfar's check to see if | ||
638 | * the request starts before the end of the file. For that check | ||
639 | * to work, we must generate a GETATTR before each direct read, and | ||
640 | * even then there is a window between the GETATTR and the subsequent | ||
641 | * READ where the file size could change. So our preference is simply | ||
642 | * to do all reads the application wants, and the server will take | ||
643 | * care of managing the end of file boundary. | ||
644 | * | ||
645 | * This function also eliminates unnecessarily updating the file's | ||
646 | * atime locally, as the NFS server sets the file's atime, and this | ||
647 | * client must read the updated atime from the server back into its | ||
648 | * cache. | ||
649 | */ | ||
650 | ssize_t | ||
651 | nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) | ||
652 | { | ||
653 | ssize_t retval = -EINVAL; | ||
654 | loff_t *ppos = &iocb->ki_pos; | ||
655 | struct file *file = iocb->ki_filp; | ||
656 | struct nfs_open_context *ctx = | ||
657 | (struct nfs_open_context *) file->private_data; | ||
658 | struct dentry *dentry = file->f_dentry; | ||
659 | struct address_space *mapping = file->f_mapping; | ||
660 | struct inode *inode = mapping->host; | ||
661 | struct iovec iov = { | ||
662 | .iov_base = buf, | ||
663 | .iov_len = count, | ||
664 | }; | ||
665 | |||
666 | dprintk("nfs: direct read(%s/%s, %lu@%lu)\n", | ||
667 | dentry->d_parent->d_name.name, dentry->d_name.name, | ||
668 | (unsigned long) count, (unsigned long) pos); | ||
669 | |||
670 | if (!is_sync_kiocb(iocb)) | ||
671 | goto out; | ||
672 | if (count < 0) | ||
673 | goto out; | ||
674 | retval = -EFAULT; | ||
675 | if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len)) | ||
676 | goto out; | ||
677 | retval = 0; | ||
678 | if (!count) | ||
679 | goto out; | ||
680 | |||
681 | if (mapping->nrpages) { | ||
682 | retval = filemap_fdatawrite(mapping); | ||
683 | if (retval == 0) | ||
684 | retval = nfs_wb_all(inode); | ||
685 | if (retval == 0) | ||
686 | retval = filemap_fdatawait(mapping); | ||
687 | if (retval) | ||
688 | goto out; | ||
689 | } | ||
690 | |||
691 | retval = nfs_direct_read(inode, ctx, &iov, pos, 1); | ||
692 | if (retval > 0) | ||
693 | *ppos = pos + retval; | ||
694 | |||
695 | out: | ||
696 | return retval; | ||
697 | } | ||
698 | |||
699 | /** | ||
700 | * nfs_file_direct_write - file direct write operation for NFS files | ||
701 | * @iocb: target I/O control block | ||
702 | * @buf: user's buffer from which to write data | ||
703 | * count: number of bytes to write | ||
704 | * pos: byte offset in file where writing starts | ||
705 | * | ||
706 | * We use this function for direct writes instead of calling | ||
707 | * generic_file_aio_write() in order to avoid taking the inode | ||
708 | * semaphore and updating the i_size. The NFS server will set | ||
709 | * the new i_size and this client must read the updated size | ||
710 | * back into its cache. We let the server do generic write | ||
711 | * parameter checking and report problems. | ||
712 | * | ||
713 | * We also avoid an unnecessary invocation of generic_osync_inode(), | ||
714 | * as it is fairly meaningless to sync the metadata of an NFS file. | ||
715 | * | ||
716 | * We eliminate local atime updates, see direct read above. | ||
717 | * | ||
718 | * We avoid unnecessary page cache invalidations for normal cached | ||
719 | * readers of this file. | ||
720 | * | ||
721 | * Note that O_APPEND is not supported for NFS direct writes, as there | ||
722 | * is no atomic O_APPEND write facility in the NFS protocol. | ||
723 | */ | ||
724 | ssize_t | ||
725 | nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) | ||
726 | { | ||
727 | ssize_t retval = -EINVAL; | ||
728 | loff_t *ppos = &iocb->ki_pos; | ||
729 | unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
730 | struct file *file = iocb->ki_filp; | ||
731 | struct nfs_open_context *ctx = | ||
732 | (struct nfs_open_context *) file->private_data; | ||
733 | struct dentry *dentry = file->f_dentry; | ||
734 | struct address_space *mapping = file->f_mapping; | ||
735 | struct inode *inode = mapping->host; | ||
736 | struct iovec iov = { | ||
737 | .iov_base = (char __user *)buf, | ||
738 | .iov_len = count, | ||
739 | }; | ||
740 | |||
741 | dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n", | ||
742 | dentry->d_parent->d_name.name, dentry->d_name.name, | ||
743 | inode->i_ino, (unsigned long) count, (unsigned long) pos); | ||
744 | |||
745 | if (!is_sync_kiocb(iocb)) | ||
746 | goto out; | ||
747 | if (count < 0) | ||
748 | goto out; | ||
749 | if (pos < 0) | ||
750 | goto out; | ||
751 | retval = -EFAULT; | ||
752 | if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) | ||
753 | goto out; | ||
754 | if (file->f_error) { | ||
755 | retval = file->f_error; | ||
756 | file->f_error = 0; | ||
757 | goto out; | ||
758 | } | ||
759 | retval = -EFBIG; | ||
760 | if (limit != RLIM_INFINITY) { | ||
761 | if (pos >= limit) { | ||
762 | send_sig(SIGXFSZ, current, 0); | ||
763 | goto out; | ||
764 | } | ||
765 | if (count > limit - (unsigned long) pos) | ||
766 | count = limit - (unsigned long) pos; | ||
767 | } | ||
768 | retval = 0; | ||
769 | if (!count) | ||
770 | goto out; | ||
771 | |||
772 | if (mapping->nrpages) { | ||
773 | retval = filemap_fdatawrite(mapping); | ||
774 | if (retval == 0) | ||
775 | retval = nfs_wb_all(inode); | ||
776 | if (retval == 0) | ||
777 | retval = filemap_fdatawait(mapping); | ||
778 | if (retval) | ||
779 | goto out; | ||
780 | } | ||
781 | |||
782 | retval = nfs_direct_write(inode, ctx, &iov, pos, 1); | ||
783 | if (mapping->nrpages) | ||
784 | invalidate_inode_pages2(mapping); | ||
785 | if (retval > 0) | ||
786 | *ppos = pos + retval; | ||
787 | |||
788 | out: | ||
789 | return retval; | ||
790 | } | ||
791 | |||
792 | int nfs_init_directcache(void) | ||
793 | { | ||
794 | nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", | ||
795 | sizeof(struct nfs_direct_req), | ||
796 | 0, SLAB_RECLAIM_ACCOUNT, | ||
797 | NULL, NULL); | ||
798 | if (nfs_direct_cachep == NULL) | ||
799 | return -ENOMEM; | ||
800 | |||
801 | return 0; | ||
802 | } | ||
803 | |||
804 | void nfs_destroy_directcache(void) | ||
805 | { | ||
806 | if (kmem_cache_destroy(nfs_direct_cachep)) | ||
807 | printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); | ||
808 | } | ||