author	Chuck Lever <cel@netapp.com>	2006-06-20 12:56:49 -0400
committer	Trond Myklebust <Trond.Myklebust@netapp.com>	2006-06-24 13:11:39 -0400
commit	06cf6f2ed0b19629700794727d86ed57b9c0583e (patch)
tree	20630465974dc9391af486d8609aae077701b596 /fs
parent	9c93ab7dff5eb22027ab15010557bb73f9b44c99 (diff)
NFS: Eliminate nfs_get_user_pages()
Neil Brown observed that the kmalloc() in nfs_get_user_pages() is more
likely to fail if the I/O is large enough to require the allocation of
more than a single page to keep track of all the pinned pages in the
user's buffer.

Instead of tracking one large page array per dreq/iocb, track pages per
nfs_read/write_data, just like the cached I/O path does.  An array for
pages is already allocated for us by nfs_readdata_alloc() (and the write
and commit equivalents).

This is also required for adding support for vectored I/O to the NFS
direct I/O path.

The original reason to pin the user buffer and allocate all the NFS data
structures before trying to schedule I/O was to ensure all needed
resources are allocated on the client before starting to send requests.
This reduces the chance that resource exhaustion on the client will
cause a short read or write.

On the other hand, for an application making very large application I/O
requests, this means that it will be nearly impossible for the
application to make forward progress on a resource-limited client.

Thus, moving the buffer pinning functionality into the I/O scheduling
loops should be good for scalability.  The next patch will do the same
for NFS data structure allocation.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
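[Editor's illustration] For readers following the arithmetic, the standalone
user-space sketch below (not the kernel code itself) mirrors the page-count
computation introduced as nfs_direct_count_pages() and the per-chunk cursor
updates in the rewritten scheduling loops.  The PAGE_SHIFT value and the
sample address/sizes are illustrative assumptions.

	/*
	 * Minimal sketch: carve a user buffer into rsize-bounded chunks
	 * and compute how many pages each chunk spans, mirroring the
	 * patch's nfs_direct_count_pages() and scheduling-loop updates.
	 */
	#include <stdio.h>
	#include <stddef.h>

	#define PAGE_SHIFT	12			/* assumed for illustration */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	/* pages spanned by the buffer [user_addr, user_addr + size) */
	static int count_pages(unsigned long user_addr, size_t size)
	{
		int page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
		page_count -= user_addr >> PAGE_SHIFT;
		return page_count;
	}

	int main(void)
	{
		unsigned long user_addr = 0x10234;	/* unaligned start (sample) */
		size_t count = 100000;			/* total request (sample) */
		size_t rsize = 32768;			/* server rsize (sample) */
		unsigned int pgbase = user_addr & ~PAGE_MASK;

		do {
			size_t bytes = count < rsize ? count : rsize;
			int npages = count_pages(user_addr, bytes);

			/* here the kernel would get_user_pages() npages pages
			 * and dispatch one READ or WRITE for this chunk */
			printf("chunk at %#lx: %zu bytes, %d pages, pgbase %u\n",
			       user_addr, bytes, npages, pgbase);

			user_addr += bytes;
			pgbase = (pgbase + bytes) & ~PAGE_MASK;
			count -= bytes;
		} while (count != 0);

		return 0;
	}

Because bytes never exceeds rsize (or wsize on the write path), each chunk's
pagevec fits in the fixed array that nfs_readdata_alloc() and its write/commit
equivalents already provide.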
Diffstat (limited to 'fs')
-rw-r--r--	fs/nfs/direct.c	205
1 files changed, 111 insertions, 94 deletions
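[Editor's illustration] As a back-of-the-envelope check on the motivation
above, this small sketch (user-space C; the 16 MiB request, 4 KiB page, and
32 KiB rsize are assumed sample values, not kernel code) compares the old
single page-pointer array against the new per-chunk bound:

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size = 4096, ptr_size = sizeof(void *);
		unsigned long io_bytes = 16UL << 20;	/* 16 MiB request (sample) */
		unsigned long rsize = 32768;		/* server rsize (sample) */

		/* old path: one array of page pointers for the whole buffer */
		unsigned long old_array = (io_bytes / page_size) * ptr_size;
		/* new path: at most rsize worth of pages, plus one for misalignment */
		unsigned long new_array = (rsize / page_size + 1) * ptr_size;

		printf("old: one %lu-byte kmalloc (%lu contiguous pages)\n",
		       old_array, (old_array + page_size - 1) / page_size);
		printf("new: at most %lu bytes per chunk, preallocated\n",
		       new_array);
		return 0;
	}

For the sample request the old array alone spans several physically
contiguous pages of kmalloc memory, the kind of multi-page allocation Neil
Brown observed was likely to fail under memory pressure.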
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4cb3446220ba..b1630d53fbb1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -73,8 +73,6 @@ struct nfs_direct_req {
 	struct nfs_open_context *ctx;	/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
-	struct page **		pages;		/* pages in our buffer */
-	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
 	atomic_t		io_count;	/* i/os we're waiting for */
@@ -104,6 +102,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 	return atomic_dec_and_test(&dreq->io_count);
 }
 
+/*
+ * "size" is never larger than rsize or wsize.
+ */
+static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
+{
+	int page_count;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+	BUG_ON(page_count < 0);
+
+	return page_count;
+}
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -143,40 +155,6 @@ static void nfs_direct_release_pages(struct page **pages, int npages)
 		page_cache_release(pages[i]);
 }
 
-static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
-{
-	int result = -ENOMEM;
-	unsigned long page_count;
-	size_t array_size;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-
-	array_size = (page_count * sizeof(struct page *));
-	*pages = kmalloc(array_size, GFP_KERNEL);
-	if (*pages) {
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					page_count, (rw == READ), 0,
-					*pages, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (result != page_count) {
-			/*
-			 * If we got fewer pages than expected from
-			 * get_user_pages(), the user buffer runs off the
-			 * end of a mapping; return EFAULT.
-			 */
-			if (result >= 0) {
-				nfs_direct_release_pages(*pages, result);
-				result = -EFAULT;
-			} else
-				kfree(*pages);
-			*pages = NULL;
-		}
-	}
-	return result;
-}
-
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
@@ -233,13 +211,8 @@ out:
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
- *
- * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
- * can't trust the iocb is still valid here if this is a synchronous
- * request.  If the waiter is woken prematurely, the iocb is long gone.
+ * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
@@ -297,6 +270,11 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
 	return dreq;
 }
 
+/*
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ */
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -305,6 +283,9 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
+	nfs_direct_dirty_pages(data->pagevec, data->npages);
+	nfs_direct_release_pages(data->pagevec, data->npages);
+
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -314,11 +295,8 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 
 	spin_unlock(&dreq->lock);
 
-	if (put_dreq(dreq)) {
-		nfs_direct_dirty_pages(dreq->pages, dreq->npages);
-		nfs_direct_release_pages(dreq->pages, dreq->npages);
+	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	}
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -328,21 +306,23 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 
 /*
  * For each nfs_read_data struct that was allocated on the list, dispatch
- * an NFS READ operation
+ * an NFS READ operation.  If get_user_pages() fails, we stop sending reads.
+ * Read length accounting is handled by nfs_direct_read_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_read_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_read_data *data;
 		size_t bytes;
 
 		bytes = rsize;
@@ -353,13 +333,21 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 		data = list_entry(list->next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 1, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -382,17 +370,36 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del(&data->pages);
+		nfs_readdata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_complete(dreq);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -404,8 +411,6 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!dreq)
 		return -ENOMEM;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -413,8 +418,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -426,9 +432,9 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 	while (!list_empty(&dreq->list)) {
 		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
+		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
-	nfs_direct_release_pages(dreq->pages, dreq->npages);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -672,21 +678,23 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 
 /*
  * For each nfs_write_data struct that was allocated on the list, dispatch
- * an NFS WRITE operation
+ * an NFS WRITE operation.  If get_user_pages() fails, we stop sending writes.
+ * Write length accounting is handled by nfs_direct_write_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_write_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_write_data *data;
 		size_t bytes;
 
 		bytes = wsize;
@@ -695,6 +703,15 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 
 		BUG_ON(list_empty(list));
 		data = list_entry(list->next, struct nfs_write_data, pages);
+
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 0, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
 		data->inode = inode;
@@ -703,7 +720,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -727,17 +744,36 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_write_data, pages);
+		list_del(&data->pages);
+		nfs_writedata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_write_complete(dreq, inode);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -753,8 +789,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -765,8 +799,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -796,8 +831,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -819,14 +852,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(READ, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
-						pages, page_count);
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -862,8 +888,6 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -891,14 +915,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
-						pos, pages, page_count);
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's