-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	101
-rw-r--r--	fs/nfs/callback.c	12
-rw-r--r--	fs/nfs/client.c	18
-rw-r--r--	fs/nfs/delegation.c	34
-rw-r--r--	fs/nfs/delegation.h	1
-rw-r--r--	fs/nfs/dir.c	208
-rw-r--r--	fs/nfs/direct.c	33
-rw-r--r--	fs/nfs/filelayout/filelayout.c	298
-rw-r--r--	fs/nfs/filelayout/filelayoutdev.c	2
-rw-r--r--	fs/nfs/inode.c	9
-rw-r--r--	fs/nfs/internal.h	11
-rw-r--r--	fs/nfs/nfs3acl.c	2
-rw-r--r--	fs/nfs/nfs3proc.c	21
-rw-r--r--	fs/nfs/nfs4_fs.h	32
-rw-r--r--	fs/nfs/nfs4client.c	5
-rw-r--r--	fs/nfs/nfs4proc.c	248
-rw-r--r--	fs/nfs/nfs4state.c	69
-rw-r--r--	fs/nfs/nfs4trace.h	28
-rw-r--r--	fs/nfs/nfs4xdr.c	2
-rw-r--r--	fs/nfs/objlayout/objio_osd.c	24
-rw-r--r--	fs/nfs/objlayout/objlayout.c	81
-rw-r--r--	fs/nfs/objlayout/objlayout.h	8
-rw-r--r--	fs/nfs/pagelist.c	276
-rw-r--r--	fs/nfs/pnfs.c	178
-rw-r--r--	fs/nfs/pnfs.h	45
-rw-r--r--	fs/nfs/proc.c	27
-rw-r--r--	fs/nfs/read.c	54
-rw-r--r--	fs/nfs/super.c	12
-rw-r--r--	fs/nfs/write.c	150
-rw-r--r--	fs/nfs_common/nfsacl.c	5
-rw-r--r--	include/linux/nfs_fs.h	2
-rw-r--r--	include/linux/nfs_fs_sb.h	1
-rw-r--r--	include/linux/nfs_page.h	19
-rw-r--r--	include/linux/nfs_xdr.h	34
-rw-r--r--	include/linux/sunrpc/auth.h	4
-rw-r--r--	include/linux/sunrpc/auth_gss.h	3
-rw-r--r--	include/linux/sunrpc/gss_krb5.h	4
-rw-r--r--	include/linux/sunrpc/xprtrdma.h	2
-rw-r--r--	net/sunrpc/addr.c	16
-rw-r--r--	net/sunrpc/auth.c	68
-rw-r--r--	net/sunrpc/auth_generic.c	6
-rw-r--r--	net/sunrpc/auth_gss/auth_gss.c	126
-rw-r--r--	net/sunrpc/auth_gss/gss_krb5_crypto.c	9
-rw-r--r--	net/sunrpc/auth_gss/gss_krb5_seal.c	28
-rw-r--r--	net/sunrpc/auth_gss/gss_krb5_wrap.c	20
-rw-r--r--	net/sunrpc/auth_null.c	2
-rw-r--r--	net/sunrpc/clnt.c	5
-rw-r--r--	net/sunrpc/rpc_pipe.c	2
-rw-r--r--	net/sunrpc/xprt.c	1
-rw-r--r--	net/sunrpc/xprtrdma/rpc_rdma.c	83
-rw-r--r--	net/sunrpc/xprtrdma/transport.c	17
-rw-r--r--	net/sunrpc/xprtrdma/verbs.c	739
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	61
-rw-r--r--	net/sunrpc/xprtsock.c	9
54 files changed, 1967 insertions, 1288 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9b431f44fad9..cbb1797149d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -210,8 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err)
 		SetPageUptodate(bvec->bv_page);
 
 	if (err) {
-		struct nfs_pgio_data *rdata = par->data;
-		struct nfs_pgio_header *header = rdata->header;
+		struct nfs_pgio_header *header = par->data;
 
 		if (!header->pnfs_error)
 			header->pnfs_error = -EIO;
@@ -224,43 +223,44 @@ static void bl_end_io_read(struct bio *bio, int err)
 static void bl_read_cleanup(struct work_struct *work)
 {
 	struct rpc_task *task;
-	struct nfs_pgio_data *rdata;
+	struct nfs_pgio_header *hdr;
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
-	rdata = container_of(task, struct nfs_pgio_data, task);
-	pnfs_ld_read_done(rdata);
+	hdr = container_of(task, struct nfs_pgio_header, task);
+	pnfs_ld_read_done(hdr);
 }
 
 static void
 bl_end_par_io_read(void *data, int unused)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	rdata->task.tk_status = rdata->header->pnfs_error;
-	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
-	schedule_work(&rdata->task.u.tk_work);
+	hdr->task.tk_status = hdr->pnfs_error;
+	INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+	schedule_work(&hdr->task.u.tk_work);
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_data *rdata)
+bl_read_pagelist(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *header = rdata->header;
+	struct nfs_pgio_header *header = hdr;
 	int i, hole;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
-	loff_t f_offset = rdata->args.offset;
-	size_t bytes_left = rdata->args.count;
+	loff_t f_offset = hdr->args.offset;
+	size_t bytes_left = hdr->args.count;
 	unsigned int pg_offset, pg_len;
-	struct page **pages = rdata->args.pages;
-	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+	struct page **pages = hdr->args.pages;
+	int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-		rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
+		hdr->page_array.npages, f_offset,
+		(unsigned int)hdr->args.count);
 
-	par = alloc_parallel(rdata);
+	par = alloc_parallel(hdr);
 	if (!par)
 		goto use_mds;
 	par->pnfs_callback = bl_end_par_io_read;
@@ -268,7 +268,7 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < rdata->pages.npages; i++) {
+	for (i = pg_index; i < hdr->page_array.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
@@ -317,7 +317,8 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 			struct pnfs_block_extent *be_read;
 
 			be_read = (hole && cow_read) ? cow_read : be;
-			bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
+			bio = do_add_page_to_bio(bio,
+						 hdr->page_array.npages - i,
 						 READ,
 						 isect, pages[i], be_read,
 						 bl_end_io_read, par,
@@ -332,10 +333,10 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-		rdata->res.eof = 1;
-		rdata->res.count = header->inode->i_size - rdata->args.offset;
+		hdr->res.eof = 1;
+		hdr->res.count = header->inode->i_size - hdr->args.offset;
 	} else {
-		rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
+		hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
 	}
 out:
 	bl_put_extent(be);
@@ -390,8 +391,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	}
 
 	if (unlikely(err)) {
-		struct nfs_pgio_data *data = par->data;
-		struct nfs_pgio_header *header = data->header;
+		struct nfs_pgio_header *header = par->data;
 
 		if (!header->pnfs_error)
 			header->pnfs_error = -EIO;
@@ -405,8 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct nfs_pgio_data *data = par->data;
-	struct nfs_pgio_header *header = data->header;
+	struct nfs_pgio_header *header = par->data;
 
 	if (!uptodate) {
 		if (!header->pnfs_error)
@@ -423,32 +422,32 @@ static void bl_end_io_write(struct bio *bio, int err)
 static void bl_write_cleanup(struct work_struct *work)
 {
 	struct rpc_task *task;
-	struct nfs_pgio_data *wdata;
+	struct nfs_pgio_header *hdr;
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
-	wdata = container_of(task, struct nfs_pgio_data, task);
-	if (likely(!wdata->header->pnfs_error)) {
+	hdr = container_of(task, struct nfs_pgio_header, task);
+	if (likely(!hdr->pnfs_error)) {
 		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
-				     wdata->args.offset, wdata->args.count);
+		mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
+				     hdr->args.offset, hdr->args.count);
 	}
-	pnfs_ld_write_done(wdata);
+	pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
 static void bl_end_par_io_write(void *data, int num_se)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(wdata->header->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
+	if (unlikely(hdr->pnfs_error)) {
+		bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
 				      num_se);
 	}
 
-	wdata->task.tk_status = wdata->header->pnfs_error;
-	wdata->verf.committed = NFS_FILE_SYNC;
-	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
-	schedule_work(&wdata->task.u.tk_work);
+	hdr->task.tk_status = hdr->pnfs_error;
+	hdr->verf.committed = NFS_FILE_SYNC;
+	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+	schedule_work(&hdr->task.u.tk_work);
 }
 
 /* FIXME STUB - mark intersection of layout and page as bad, so is not
@@ -673,18 +672,17 @@ check_page:
 }
 
 static enum pnfs_try_status
-bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	struct nfs_pgio_header *header = wdata->header;
 	int i, ret, npg_zero, pg_index, last = 0;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
 	sector_t isect, last_isect = 0, extent_length = 0;
 	struct parallel_io *par = NULL;
-	loff_t offset = wdata->args.offset;
-	size_t count = wdata->args.count;
+	loff_t offset = header->args.offset;
+	size_t count = header->args.count;
 	unsigned int pg_offset, pg_len, saved_len;
-	struct page **pages = wdata->args.pages;
+	struct page **pages = header->args.pages;
 	struct page *page;
 	pgoff_t index;
 	u64 temp;
@@ -699,11 +697,11 @@ bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
 		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
 		goto out_mds;
 	}
-	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
+	/* At this point, header->page_array is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
 	 */
-	par = alloc_parallel(wdata);
+	par = alloc_parallel(header);
 	if (!par)
 		goto out_mds;
 	par->pnfs_callback = bl_end_par_io_write;
@@ -790,8 +788,8 @@ next_page:
 	bio = bl_submit_bio(WRITE, bio);
 
 	/* Middle pages */
-	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
-	for (i = pg_index; i < wdata->pages.npages; i++) {
+	pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
@@ -862,7 +860,8 @@ next_page:
 		}
 
 
-		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+					 WRITE,
 					 isect, pages[i], be,
 					 bl_end_io_write, par,
 					 pg_offset, pg_len);
@@ -890,7 +889,7 @@ next_page:
 	}
 
 write_done:
-	wdata->res.count = wdata->args.count;
+	header->res.count = header->args.count;
 out:
 	bl_put_extent(be);
 	bl_put_extent(cow_read);
@@ -1063,7 +1062,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+	pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
 	if (pages == NULL) {
 		kfree(dev);
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed9..54de482143cc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -428,6 +428,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
 	if (p == NULL)
 		return 0;
 
+	/*
+	 * Did we get the acceptor from userland during the SETCLIENTID
+	 * negotiation?
+	 */
+	if (clp->cl_acceptor)
+		return !strcmp(p, clp->cl_acceptor);
+
+	/*
+	 * Otherwise try to verify it using the cl_hostname. Note that this
+	 * doesn't work if a non-canonical hostname was used in the devname.
+	 */
+
 	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
 	if (memcmp(p, "nfs@", 4) != 0)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 180d1ec9c32e..1c5ff6d58385 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
 		mutex_unlock(&nfs_version_mutex);
 	}
 
-	if (!IS_ERR(nfs))
-		try_module_get(nfs->owner);
+	if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+		return ERR_PTR(-EAGAIN);
 	return nfs;
 }
 
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 		goto error_0;
 
 	clp->cl_nfs_mod = cl_init->nfs_mod;
-	try_module_get(clp->cl_nfs_mod->owner);
+	if (!try_module_get(clp->cl_nfs_mod->owner))
+		goto error_dealloc;
 
 	clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
 
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 
 error_cleanup:
 	put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
 	kfree(clp);
 error_0:
 	return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
 	put_net(clp->cl_net);
 	put_nfs_version(clp->cl_nfs_mod);
 	kfree(clp->cl_hostname);
+	kfree(clp->cl_acceptor);
 	kfree(clp);
 
 	dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
 	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
 
+	if (cl_init->hostname == NULL) {
+		WARN_ON(1);
+		return NULL;
+	}
+
 	dprintk("--> nfs_get_client(%s,v%u)\n",
-		cl_init->hostname ?: "", rpc_ops->version);
+		cl_init->hostname, rpc_ops->version);
 
 	/* see if the client already exists */
 	do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 	} while (!IS_ERR(new));
 
 	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
-		cl_init->hostname ?: "", PTR_ERR(new));
+		cl_init->hostname, PTR_ERR(new));
 	return new;
 }
 EXPORT_SYMBOL_GPL(nfs_get_client);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5c..5853f53db732 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
 	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
 	struct nfs_delegation *delegation;
 	int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 	if (delegation != NULL && (delegation->type & flags) == flags &&
 	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
-		nfs_mark_delegation_referenced(delegation);
+		if (mark)
+			nfs_mark_delegation_referenced(delegation);
 		ret = 1;
 	}
 	rcu_read_unlock();
 	return ret;
 }
+/**
+ * nfs_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+	return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+	return nfs4_do_check_delegation(inode, flags, false);
+}
 
 static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6d..5c1cce39297f 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 
 #endif
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef76127..36d921f0c602 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
  * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
  */
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+			      int rcu_walk)
 {
+	int ret;
+
 	if (IS_ROOT(dentry))
 		return 1;
 	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 	if (!nfs_verify_change_attribute(dir, dentry->d_time))
 		return 0;
 	/* Revalidate nfsi->cache_change_attribute before we declare a match */
-	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+	if (rcu_walk)
+		ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+	else
+		ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+	if (ret < 0)
 		return 0;
 	if (!nfs_verify_change_attribute(dir, dentry->d_time))
 		return 0;
@@ -1042,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
 out:
 	return (inode->i_nlink == 0) ? -ENOENT : 0;
 out_force:
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
 	ret = __nfs_revalidate_inode(server, inode);
 	if (ret != 0)
 		return ret;
@@ -1054,6 +1064,9 @@ out_force:
  *
  * If parent mtime has changed, we revalidate, else we wait for a
  * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
  */
 static inline
 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1077,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
 		return 0;
 	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
 		return 1;
-	return !nfs_check_verifier(dir, dentry);
+	return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
 /*
@@ -1088,21 +1101,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	struct nfs4_label *label = NULL;
 	int error;
 
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	if (flags & LOOKUP_RCU) {
+		parent = ACCESS_ONCE(dentry->d_parent);
+		dir = ACCESS_ONCE(parent->d_inode);
+		if (!dir)
+			return -ECHILD;
+	} else {
+		parent = dget_parent(dentry);
+		dir = parent->d_inode;
+	}
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
 	inode = dentry->d_inode;
 
 	if (!inode) {
-		if (nfs_neg_need_reval(dir, dentry, flags))
+		if (nfs_neg_need_reval(dir, dentry, flags)) {
+			if (flags & LOOKUP_RCU)
+				return -ECHILD;
 			goto out_bad;
+		}
 		goto out_valid_noent;
 	}
 
 	if (is_bad_inode(inode)) {
+		if (flags & LOOKUP_RCU)
+			return -ECHILD;
 		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
 				__func__, dentry);
 		goto out_bad;
@@ -1112,12 +1134,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out_set_verifier;
 
 	/* Force a full look up iff the parent directory has changed */
-	if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
-		if (nfs_lookup_verify_inode(inode, flags))
+	if (!nfs_is_exclusive_create(dir, flags) &&
+	    nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+		if (nfs_lookup_verify_inode(inode, flags)) {
+			if (flags & LOOKUP_RCU)
+				return -ECHILD;
 			goto out_zap_parent;
+		}
 		goto out_valid;
 	}
 
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	if (NFS_STALE(inode))
 		goto out_bad;
 
@@ -1153,13 +1183,18 @@ out_set_verifier:
 	/* Success: notify readdir to use READDIRPLUS */
 	nfs_advise_use_readdirplus(dir);
  out_valid_noent:
-	dput(parent);
+	if (flags & LOOKUP_RCU) {
+		if (parent != ACCESS_ONCE(dentry->d_parent))
+			return -ECHILD;
+	} else
+		dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
 			__func__, dentry);
 	return 1;
 out_zap_parent:
 	nfs_zap_caches(dir);
  out_bad:
+	WARN_ON(flags & LOOKUP_RCU);
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
 	nfs4_label_free(label);
@@ -1185,6 +1220,7 @@ out_zap_parent:
 		__func__, dentry);
 	return 0;
 out_error:
+	WARN_ON(flags & LOOKUP_RCU);
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
 	nfs4_label_free(label);
@@ -1529,14 +1565,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
 
 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *parent = NULL;
 	struct inode *inode;
-	struct inode *dir;
 	int ret = 0;
 
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
 	if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
 		goto no_open;
 	if (d_mountpoint(dentry))
@@ -1545,34 +1576,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		goto no_open;
 
 	inode = dentry->d_inode;
-	parent = dget_parent(dentry);
-	dir = parent->d_inode;
 
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
 	if (inode == NULL) {
+		struct dentry *parent;
+		struct inode *dir;
+
+		if (flags & LOOKUP_RCU) {
+			parent = ACCESS_ONCE(dentry->d_parent);
+			dir = ACCESS_ONCE(parent->d_inode);
+			if (!dir)
+				return -ECHILD;
+		} else {
+			parent = dget_parent(dentry);
+			dir = parent->d_inode;
+		}
 		if (!nfs_neg_need_reval(dir, dentry, flags))
 			ret = 1;
+		else if (flags & LOOKUP_RCU)
+			ret = -ECHILD;
+		if (!(flags & LOOKUP_RCU))
+			dput(parent);
+		else if (parent != ACCESS_ONCE(dentry->d_parent))
+			return -ECHILD;
 		goto out;
 	}
 
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
-		goto no_open_dput;
+		goto no_open;
 	/* We cannot do exclusive creation on a positive dentry */
 	if (flags & LOOKUP_EXCL)
-		goto no_open_dput;
+		goto no_open;
 
 	/* Let f_op->open() actually open (and revalidate) the file */
 	ret = 1;
 
 out:
-	dput(parent);
 	return ret;
 
-no_open_dput:
-	dput(parent);
 no_open:
 	return nfs_lookup_revalidate(dentry, flags);
 }
@@ -2028,10 +2072,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
 static LIST_HEAD(nfs_access_lru_list);
 static atomic_long_t nfs_access_nr_entries;
 
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
 static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
 	put_rpccred(entry->cred);
-	kfree(entry);
+	kfree_rcu(entry, rcu_head);
 	smp_mb__before_atomic();
 	atomic_long_dec(&nfs_access_nr_entries);
 	smp_mb__after_atomic();
@@ -2048,19 +2096,14 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-unsigned long
-nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
-	int nr_to_scan = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
 	long freed = 0;
 
-	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return SHRINK_STOP;
-
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
 		struct inode *inode;
@@ -2094,11 +2137,39 @@ remove_lru_entry:
 }
 
 unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+
+	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+	return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
+unsigned long
 nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
 }
 
+static void
+nfs_access_cache_enforce_limit(void)
+{
+	long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+	unsigned long diff;
+	unsigned int nr_to_scan;
+
+	if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+		return;
+	nr_to_scan = 100;
+	diff = nr_entries - nfs_access_max_cachesize;
+	if (diff < nr_to_scan)
+		nr_to_scan = diff;
+	nfs_do_access_cache_scan(nr_to_scan);
+}
+
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
 {
 	struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2257,38 @@ out_zap:
 	return -ENOENT;
 }
 
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+	/* Only check the most recently returned cache entry,
+	 * but do it without locking.
+	 */
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache;
+	int err = -ECHILD;
+	struct list_head *lh;
+
+	rcu_read_lock();
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+		goto out;
+	lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+	cache = list_entry(lh, struct nfs_access_entry, lru);
+	if (lh == &nfsi->access_cache_entry_lru ||
+	    cred != cache->cred)
+		cache = NULL;
+	if (cache == NULL)
+		goto out;
+	if (!nfs_have_delegated_attributes(inode) &&
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+		goto out;
+	res->jiffies = cache->jiffies;
+	res->cred = cache->cred;
+	res->mask = cache->mask;
+	err = 0;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2332,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 	cache->cred = get_rpccred(set->cred);
 	cache->mask = set->mask;
 
+	/* The above field assignments must be visible
+	 * before this item appears on the lru. We cannot easily
+	 * use rcu_assign_pointer, so just force the memory barrier.
+	 */
+	smp_wmb();
 	nfs_access_add_rbtree(inode, cache);
 
 	/* Update accounting */
@@ -2244,6 +2352,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 				&nfs_access_lru_list);
 		spin_unlock(&nfs_access_lru_lock);
 	}
+	nfs_access_cache_enforce_limit();
 }
 EXPORT_SYMBOL_GPL(nfs_access_add_cache);
 
@@ -2267,10 +2376,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 
 	trace_nfs_access_enter(inode);
 
-	status = nfs_access_get_cached(inode, cred, &cache);
+	status = nfs_access_get_cached_rcu(inode, cred, &cache);
+	if (status != 0)
+		status = nfs_access_get_cached(inode, cred, &cache);
 	if (status == 0)
 		goto out_cached;
 
+	status = -ECHILD;
+	if (mask & MAY_NOT_BLOCK)
+		goto out;
+
 	/* Be clever: ask server to check for all possible rights */
 	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
 	cache.cred = cred;
@@ -2321,9 +2436,6 @@ int nfs_permission(struct inode *inode, int mask)
 	struct rpc_cred *cred;
 	int res = 0;
 
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
 	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
 
 	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2462,23 @@ force_lookup:
 	if (!NFS_PROTO(inode)->access)
 		goto out_notsup;
 
-	cred = rpc_lookup_cred();
-	if (!IS_ERR(cred)) {
-		res = nfs_do_access(inode, cred, mask);
-		put_rpccred(cred);
-	} else
+	/* Always try fast lookups first */
+	rcu_read_lock();
+	cred = rpc_lookup_cred_nonblock();
+	if (!IS_ERR(cred))
+		res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+	else
 		res = PTR_ERR(cred);
+	rcu_read_unlock();
+	if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+		/* Fast lookup failed, try the slow way */
+		cred = rpc_lookup_cred();
+		if (!IS_ERR(cred)) {
+			res = nfs_do_access(inode, cred, mask);
+			put_rpccred(cred);
+		} else
+			res = PTR_ERR(cred);
+	}
 out:
 	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
 		res = -EACCES;
@@ -2364,6 +2487,9 @@ out:
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
 out_notsup:
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
 		res = generic_permission(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f11b9eed0de1..65ef6e00deee 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 {
 	struct nfs_writeverf *verfp;
 
-	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-				       hdr->data->ds_idx);
+	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+				       hdr->ds_idx);
 	WARN_ON_ONCE(verfp->committed >= 0);
 	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 	WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 {
 	struct nfs_writeverf *verfp;
 
-	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-				       hdr->data->ds_idx);
+	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+				       hdr->ds_idx);
 	if (verfp->committed < 0) {
 		nfs_direct_set_hdr_verf(dreq, hdr);
 		return 0;
@@ -715,7 +715,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
 	struct nfs_commit_info cinfo;
-	int bit = -1;
+	bool request_commit = false;
 	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +729,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 		dreq->flags = 0;
 		dreq->error = hdr->error;
 	}
-	if (dreq->error != 0)
-		bit = NFS_IOHDR_ERROR;
-	else {
+	if (dreq->error == 0) {
 		dreq->count += hdr->good_bytes;
-		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-			bit = NFS_IOHDR_NEED_RESCHED;
-		} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+		if (nfs_write_need_commit(hdr)) {
 			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
-				bit = NFS_IOHDR_NEED_RESCHED;
+				request_commit = true;
 			else if (dreq->flags == 0) {
 				nfs_direct_set_hdr_verf(dreq, hdr);
-				bit = NFS_IOHDR_NEED_COMMIT;
+				request_commit = true;
 				dreq->flags = NFS_ODIRECT_DO_COMMIT;
 			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
-				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+				request_commit = true;
+				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
 					dreq->flags =
 						NFS_ODIRECT_RESCHED_WRITES;
-					bit = NFS_IOHDR_NEED_RESCHED;
-				} else
-					bit = NFS_IOHDR_NEED_COMMIT;
 			}
 		}
 	}
@@ -759,9 +752,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 
 		req = nfs_list_entry(hdr->pages.next);
 		nfs_list_remove_request(req);
-		switch (bit) {
-		case NFS_IOHDR_NEED_RESCHED:
-		case NFS_IOHDR_NEED_COMMIT:
+		if (request_commit) {
 			kref_get(&req->wb_kref);
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 		}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2eba1c13b7e..1359c4a27393 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
-static void filelayout_reset_write(struct nfs_pgio_data *data)
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	struct rpc_task *task = &data->task;
+	struct rpc_task *task = &hdr->task;
 
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		dprintk("%s Reset task %5u for i/o through MDS "
 			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-			data->task.tk_pid,
+			hdr->task.tk_pid,
 			hdr->inode->i_sb->s_id,
 			(unsigned long long)NFS_FILEID(hdr->inode),
-			data->args.count,
-			(unsigned long long)data->args.offset);
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
 
-		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-							&hdr->pages,
-							hdr->completion_ops,
-							hdr->dreq);
+		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
 	}
 }
 
-static void filelayout_reset_read(struct nfs_pgio_data *data)
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	struct rpc_task *task = &data->task;
+	struct rpc_task *task = &hdr->task;
 
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		dprintk("%s Reset task %5u for i/o through MDS "
 			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-			data->task.tk_pid,
+			hdr->task.tk_pid,
 			hdr->inode->i_sb->s_id,
 			(unsigned long long)NFS_FILEID(hdr->inode),
-			data->args.count,
-			(unsigned long long)data->args.offset);
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
 
-		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-							&hdr->pages,
-							hdr->completion_ops,
-							hdr->dreq);
+		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
 	}
 }
 
@@ -243,18 +235,17 @@ wait_on_recovery:
 /* NFS_PROTO call done callback routines */
 
 static int filelayout_read_done_cb(struct rpc_task *task,
-				struct nfs_pgio_data *data)
+				struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	int err;
 
-	trace_nfs4_pnfs_read(data, task->tk_status);
-	err = filelayout_async_handle_error(task, data->args.context->state,
-					    data->ds_clp, hdr->lseg);
+	trace_nfs4_pnfs_read(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_MDS:
-		filelayout_reset_read(data);
+		filelayout_reset_read(hdr);
 		return task->tk_status;
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
@@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
  * rfc5661 is not clear about which credential should be used.
  */
 static void
-filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 
 	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-	    wdata->res.verf->committed == NFS_FILE_SYNC)
+	    hdr->res.verf->committed == NFS_FILE_SYNC)
 		return;
 
-	pnfs_set_layoutcommit(wdata);
+	pnfs_set_layoutcommit(hdr);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
 		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
  */
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return;
 	}
-	if (filelayout_reset_to_mds(rdata->header->lseg)) {
+	if (filelayout_reset_to_mds(hdr->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-		filelayout_reset_read(rdata);
+		filelayout_reset_read(hdr);
 		rpc_exit(task, 0);
 		return;
 	}
-	rdata->pgio_done_cb = filelayout_read_done_cb;
+	hdr->pgio_done_cb = filelayout_read_done_cb;
 
-	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
-			&rdata->args.seq_args,
-			&rdata->res.seq_res,
+	if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
 			task))
 		return;
-	if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
-			rdata->args.lock_context, FMODE_READ) == -EIO)
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_READ) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
-		nfs41_sequence_done(task, &rdata->res.seq_res);
+		nfs41_sequence_done(task, &hdr->res.seq_res);
 		return;
 	}
 
 	/* Note this may cause RPC to be resent */
-	rdata->header->mds_ops->rpc_call_done(task, data);
+	hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
-	struct nfs_pgio_data *rdata = data;
-	struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
+	struct nfs_pgio_header *hdr = data;
+	struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
 	filelayout_fenceme(lo->plh_inode, lo);
-	nfs_put_client(rdata->ds_clp);
-	rdata->header->mds_ops->rpc_release(data);
+	nfs_put_client(hdr->ds_clp);
+	hdr->mds_ops->rpc_release(data);
 }
 
 static int filelayout_write_done_cb(struct rpc_task *task,
-				struct nfs_pgio_data *data)
+				struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	int err;
 
-	trace_nfs4_pnfs_write(data, task->tk_status);
-	err = filelayout_async_handle_error(task, data->args.context->state,
-					    data->ds_clp, hdr->lseg);
+	trace_nfs4_pnfs_write(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_MDS:
-		filelayout_reset_write(data);
+		filelayout_reset_write(hdr);
 		return task->tk_status;
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
-	filelayout_set_layoutcommit(data);
+	filelayout_set_layoutcommit(hdr);
 	return 0;
 }
 
@@ -419,57 +408,57 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return;
 	}
-	if (filelayout_reset_to_mds(wdata->header->lseg)) {
+	if (filelayout_reset_to_mds(hdr->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-		filelayout_reset_write(wdata);
+		filelayout_reset_write(hdr);
 		rpc_exit(task, 0);
 		return;
 	}
-	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-			&wdata->args.seq_args,
-			&wdata->res.seq_res,
+	if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
 			task))
 		return;
-	if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
-			wdata->args.lock_context, FMODE_WRITE) == -EIO)
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_WRITE) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
-		nfs41_sequence_done(task, &wdata->res.seq_res);
+		nfs41_sequence_done(task, &hdr->res.seq_res);
 		return;
 	}
 
 	/* Note this may cause RPC to be resent */
-	wdata->header->mds_ops->rpc_call_done(task, data);
+	hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
-	struct nfs_pgio_data *wdata = data;
-	struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
+	struct nfs_pgio_header *hdr = data;
+	struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
 	filelayout_fenceme(lo->plh_inode, lo);
-	nfs_put_client(wdata->ds_clp);
-	wdata->header->mds_ops->rpc_release(data);
+	nfs_put_client(hdr->ds_clp);
+	hdr->mds_ops->rpc_release(data);
 }
 
 static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
 };
 
 static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_pgio_data *data)
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	struct rpc_clnt *ds_clnt;
-	loff_t offset = data->args.offset;
+	loff_t offset = hdr->args.offset;
 	u32 j, idx;
 	struct nfs_fh *fh;
 
 	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
 		__func__, hdr->inode->i_ino,
-		data->args.pgbase, (size_t)data->args.count, offset);
+		hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
559 547
560 /* No multipath support. Use first DS */ 548 /* No multipath support. Use first DS */
561 atomic_inc(&ds->ds_clp->cl_count); 549 atomic_inc(&ds->ds_clp->cl_count);
562 data->ds_clp = ds->ds_clp; 550 hdr->ds_clp = ds->ds_clp;
563 data->ds_idx = idx; 551 hdr->ds_idx = idx;
564 fh = nfs4_fl_select_ds_fh(lseg, j); 552 fh = nfs4_fl_select_ds_fh(lseg, j);
565 if (fh) 553 if (fh)
566 data->args.fh = fh; 554 hdr->args.fh = fh;
567 555
568 data->args.offset = filelayout_get_dserver_offset(lseg, offset); 556 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
569 data->mds_offset = offset; 557 hdr->mds_offset = offset;
570 558
571 /* Perform an asynchronous read to ds */ 559 /* Perform an asynchronous read to ds */
572 nfs_initiate_pgio(ds_clnt, data, 560 nfs_initiate_pgio(ds_clnt, hdr,
573 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
574 return PNFS_ATTEMPTED; 562 return PNFS_ATTEMPTED;
575} 563}
576 564
577/* Perform async writes. */ 565/* Perform async writes. */
578static enum pnfs_try_status 566static enum pnfs_try_status
579filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) 567filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
580{ 568{
581 struct nfs_pgio_header *hdr = data->header;
582 struct pnfs_layout_segment *lseg = hdr->lseg; 569 struct pnfs_layout_segment *lseg = hdr->lseg;
583 struct nfs4_pnfs_ds *ds; 570 struct nfs4_pnfs_ds *ds;
584 struct rpc_clnt *ds_clnt; 571 struct rpc_clnt *ds_clnt;
585 loff_t offset = data->args.offset; 572 loff_t offset = hdr->args.offset;
586 u32 j, idx; 573 u32 j, idx;
587 struct nfs_fh *fh; 574 struct nfs_fh *fh;
588 575
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
598 return PNFS_NOT_ATTEMPTED; 585 return PNFS_NOT_ATTEMPTED;
599 586
600 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", 587 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
601 __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, 588 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
602 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); 589 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
603 590
604 data->pgio_done_cb = filelayout_write_done_cb; 591 hdr->pgio_done_cb = filelayout_write_done_cb;
605 atomic_inc(&ds->ds_clp->cl_count); 592 atomic_inc(&ds->ds_clp->cl_count);
606 data->ds_clp = ds->ds_clp; 593 hdr->ds_clp = ds->ds_clp;
607 data->ds_idx = idx; 594 hdr->ds_idx = idx;
608 fh = nfs4_fl_select_ds_fh(lseg, j); 595 fh = nfs4_fl_select_ds_fh(lseg, j);
609 if (fh) 596 if (fh)
610 data->args.fh = fh; 597 hdr->args.fh = fh;
611 598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
612 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
613 599
614 /* Perform an asynchronous write */ 600 /* Perform an asynchronous write */
615 nfs_initiate_pgio(ds_clnt, data, 601 nfs_initiate_pgio(ds_clnt, hdr,
616 &filelayout_write_call_ops, sync, 602 &filelayout_write_call_ops, sync,
617 RPC_TASK_SOFTCONN); 603 RPC_TASK_SOFTCONN);
618 return PNFS_ATTEMPTED; 604 return PNFS_ATTEMPTED;
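
Both pagelist entry points above now take the nfs_pgio_header directly: the old nfs_pgio_data wrapper and its ->header back-pointer are gone, and the rpc_task lives embedded in the header itself. Below is a minimal userspace sketch of the container_of() pattern that this layout enables (the same trick free_lock_state_work() uses later in this series); the struct fields here are invented simplifications, not the real kernel definitions:

#include <stddef.h>
#include <stdio.h>

/* Userspace stand-in for the kernel macro: recover the enclosing
 * structure from a pointer to one of its embedded members. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rpc_task { int tk_pid; };

struct nfs_pgio_header {	/* drastically simplified, hypothetical */
	int ds_idx;
	struct rpc_task task;	/* embedded, not pointed-to */
};

int main(void)
{
	struct nfs_pgio_header hdr = { .ds_idx = 3 };
	struct rpc_task *task = &hdr.task;
	struct nfs_pgio_header *back =
		container_of(task, struct nfs_pgio_header, task);

	printf("ds_idx = %d\n", back->ds_idx);	/* prints 3 */
	return 0;
}
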
@@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1023 1009
1024/* The generic layer is about to remove the req from the commit list. 1010/* The generic layer is about to remove the req from the commit list.
1025 * If this will make the bucket empty, it will need to put the lseg reference. 1011 * If this will make the bucket empty, it will need to put the lseg reference.
 1012 * Note this must be called holding the inode (/cinfo) lock
1026 */ 1013 */
1027static void 1014static void
1028filelayout_clear_request_commit(struct nfs_page *req, 1015filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
1030{ 1017{
1031 struct pnfs_layout_segment *freeme = NULL; 1018 struct pnfs_layout_segment *freeme = NULL;
1032 1019
1033 spin_lock(cinfo->lock);
1034 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) 1020 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1035 goto out; 1021 goto out;
1036 cinfo->ds->nwritten--; 1022 cinfo->ds->nwritten--;
@@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
1045 } 1031 }
1046out: 1032out:
1047 nfs_request_remove_commit_list(req, cinfo); 1033 nfs_request_remove_commit_list(req, cinfo);
1048 spin_unlock(cinfo->lock); 1034 pnfs_put_lseg_async(freeme);
1049 pnfs_put_lseg(freeme);
1050} 1035}
1051 1036
1052static struct list_head * 1037static void
1053filelayout_choose_commit_list(struct nfs_page *req, 1038filelayout_mark_request_commit(struct nfs_page *req,
1054 struct pnfs_layout_segment *lseg, 1039 struct pnfs_layout_segment *lseg,
1055 struct nfs_commit_info *cinfo) 1040 struct nfs_commit_info *cinfo)
1041
1056{ 1042{
1057 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 1043 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
1058 u32 i, j; 1044 u32 i, j;
1059 struct list_head *list; 1045 struct list_head *list;
1060 struct pnfs_commit_bucket *buckets; 1046 struct pnfs_commit_bucket *buckets;
1061 1047
1062 if (fl->commit_through_mds) 1048 if (fl->commit_through_mds) {
1063 return &cinfo->mds->list; 1049 list = &cinfo->mds->list;
1050 spin_lock(cinfo->lock);
1051 goto mds_commit;
1052 }
1064 1053
1065 /* Note that we are calling nfs4_fl_calc_j_index on each page 1054 /* Note that we are calling nfs4_fl_calc_j_index on each page
1066 * that ends up being committed to a data server. An attractive 1055 * that ends up being committed to a data server. An attractive
@@ -1084,19 +1073,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
1084 } 1073 }
1085 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1074 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1086 cinfo->ds->nwritten++; 1075 cinfo->ds->nwritten++;
1087 spin_unlock(cinfo->lock);
1088 return list;
1089}
1090 1076
1091static void 1077mds_commit:
1092filelayout_mark_request_commit(struct nfs_page *req, 1078 /* nfs_request_add_commit_list(). We need to add req to list without
1093 struct pnfs_layout_segment *lseg, 1079 * dropping cinfo lock.
1094 struct nfs_commit_info *cinfo) 1080 */
1095{ 1081 set_bit(PG_CLEAN, &(req)->wb_flags);
1096 struct list_head *list; 1082 nfs_list_add_request(req, list);
1097 1083 cinfo->mds->ncommit++;
1098 list = filelayout_choose_commit_list(req, lseg, cinfo); 1084 spin_unlock(cinfo->lock);
1099 nfs_request_add_commit_list(req, list, cinfo); 1085 if (!cinfo->dreq) {
1086 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1087 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1088 BDI_RECLAIMABLE);
1089 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1090 I_DIRTY_DATASYNC);
1091 }
1100} 1092}
1101 1093
1102static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) 1094static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
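
The rewritten filelayout_mark_request_commit() above now does the whole job (flag the request, pick a list, insert, bump the counter) under a single hold of cinfo->lock, instead of choosing a list in one locked section and re-acquiring the lock inside nfs_request_add_commit_list(). A userspace sketch of that single-critical-section shape, with hypothetical names standing in for the nfs_page and nfs_commit_info machinery:

#include <pthread.h>
#include <stdio.h>

struct req { struct req *next; int clean; };

static pthread_mutex_t cinfo_lock = PTHREAD_MUTEX_INITIALIZER;
static struct req *mds_list;
static int ncommit;

static void mark_request_commit(struct req *req)
{
	pthread_mutex_lock(&cinfo_lock);
	req->clean = 1;		/* set_bit(PG_CLEAN, ...) */
	req->next = mds_list;	/* nfs_list_add_request() */
	mds_list = req;
	ncommit++;		/* cinfo->mds->ncommit++ */
	pthread_mutex_unlock(&cinfo_lock);
}

int main(void)
{
	struct req r = { NULL, 0 };

	mark_request_commit(&r);
	printf("ncommit=%d clean=%d\n", ncommit, r.clean);
	return 0;
}
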
@@ -1244,15 +1236,63 @@ restart:
1244 spin_unlock(cinfo->lock); 1236 spin_unlock(cinfo->lock);
1245} 1237}
1246 1238
 1239/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
1240 * for @page
1241 * @cinfo - commit info for current inode
1242 * @page - page to search for matching head request
1243 *
 1244 * Returns the head request if one is found, otherwise returns NULL.
1245 */
1246static struct nfs_page *
1247filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1248{
1249 struct nfs_page *freq, *t;
1250 struct pnfs_commit_bucket *b;
1251 int i;
1252
1253 /* Linearly search the commit lists for each bucket until a matching
1254 * request is found */
1255 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1256 list_for_each_entry_safe(freq, t, &b->written, wb_list) {
1257 if (freq->wb_page == page)
1258 return freq->wb_head;
1259 }
1260 list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
1261 if (freq->wb_page == page)
1262 return freq->wb_head;
1263 }
1264 }
1265
1266 return NULL;
1267}
1268
1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1270{
1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1272 struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
1273 struct pnfs_layout_segment *freeme;
1274 int i;
1275
1276 for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
1277 if (list_empty(&bucket->committing))
1278 continue;
1279 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1280 spin_lock(cinfo->lock);
1281 freeme = bucket->clseg;
1282 bucket->clseg = NULL;
1283 spin_unlock(cinfo->lock);
1284 pnfs_put_lseg(freeme);
1285 }
1286}
1287
1247static unsigned int 1288static unsigned int
1248alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) 1289alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1249{ 1290{
1250 struct pnfs_ds_commit_info *fl_cinfo; 1291 struct pnfs_ds_commit_info *fl_cinfo;
1251 struct pnfs_commit_bucket *bucket; 1292 struct pnfs_commit_bucket *bucket;
1252 struct nfs_commit_data *data; 1293 struct nfs_commit_data *data;
1253 int i, j; 1294 int i;
1254 unsigned int nreq = 0; 1295 unsigned int nreq = 0;
1255 struct pnfs_layout_segment *freeme;
1256 1296
1257 fl_cinfo = cinfo->ds; 1297 fl_cinfo = cinfo->ds;
1258 bucket = fl_cinfo->buckets; 1298 bucket = fl_cinfo->buckets;
@@ -1272,16 +1312,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1272 } 1312 }
1273 1313
1274 /* Clean up on error */ 1314 /* Clean up on error */
1275 for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { 1315 filelayout_retry_commit(cinfo, i);
1276 if (list_empty(&bucket->committing))
1277 continue;
1278 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1279 spin_lock(cinfo->lock);
1280 freeme = bucket->clseg;
1281 bucket->clseg = NULL;
1282 spin_unlock(cinfo->lock);
1283 pnfs_put_lseg(freeme);
1284 }
1285 /* Caller will clean up entries put on list */ 1316 /* Caller will clean up entries put on list */
1286 return nreq; 1317 return nreq;
1287} 1318}
@@ -1301,8 +1332,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1301 data->lseg = NULL; 1332 data->lseg = NULL;
1302 list_add(&data->pages, &list); 1333 list_add(&data->pages, &list);
1303 nreq++; 1334 nreq++;
1304 } else 1335 } else {
1305 nfs_retry_commit(mds_pages, NULL, cinfo); 1336 nfs_retry_commit(mds_pages, NULL, cinfo);
1337 filelayout_retry_commit(cinfo, 0);
1338 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1339 return -ENOMEM;
1340 }
1306 } 1341 }
1307 1342
1308 nreq += alloc_ds_commits(cinfo, &list); 1343 nreq += alloc_ds_commits(cinfo, &list);
@@ -1380,6 +1415,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1380 .clear_request_commit = filelayout_clear_request_commit, 1415 .clear_request_commit = filelayout_clear_request_commit,
1381 .scan_commit_lists = filelayout_scan_commit_lists, 1416 .scan_commit_lists = filelayout_scan_commit_lists,
1382 .recover_commit_reqs = filelayout_recover_commit_reqs, 1417 .recover_commit_reqs = filelayout_recover_commit_reqs,
1418 .search_commit_reqs = filelayout_search_commit_reqs,
1383 .commit_pagelist = filelayout_commit_pagelist, 1419 .commit_pagelist = filelayout_commit_pagelist,
1384 .read_pagelist = filelayout_read_pagelist, 1420 .read_pagelist = filelayout_read_pagelist,
1385 .write_pagelist = filelayout_write_pagelist, 1421 .write_pagelist = filelayout_write_pagelist,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index e2a0361e24c6..8540516f4d71 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -695,7 +695,7 @@ filelayout_get_device_info(struct inode *inode,
695 if (pdev == NULL) 695 if (pdev == NULL)
696 return NULL; 696 return NULL;
697 697
698 pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); 698 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 if (pages == NULL) { 699 if (pages == NULL) {
700 kfree(pdev); 700 kfree(pdev);
701 return NULL; 701 return NULL;
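
The kzalloc(max_pages * sizeof(...)) to kcalloc(max_pages, sizeof(...)) switch above is about the multiplication: kcalloc() refuses a product that would overflow rather than letting it wrap to a too-small allocation. A userspace approximation of the same idea (xcalloc is an invented helper):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Calloc-style allocation with an explicit overflow check, mirroring
 * what kcalloc() guarantees over a bare kzalloc(n * size). */
static void *xcalloc(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)
		return NULL;		/* would overflow: refuse */
	return calloc(n, size);		/* zeroed, like kcalloc */
}

int main(void)
{
	void *ok = xcalloc(16, sizeof(void *));
	void *bad = xcalloc(SIZE_MAX, 2);	/* rejected, not wrapped */

	printf("%s %s\n", ok ? "ok" : "fail", bad ? "BUG" : "rejected");
	free(ok);
	return 0;
}
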
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 68921b01b792..577a36f0a510 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1002,6 +1002,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1002} 1002}
1003EXPORT_SYMBOL_GPL(nfs_revalidate_inode); 1003EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
1004 1004
1005int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
1006{
1007 if (!(NFS_I(inode)->cache_validity &
1008 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
1009 && !nfs_attribute_cache_expired(inode))
1010 return NFS_STALE(inode) ? -ESTALE : 0;
1011 return -ECHILD;
1012}
1013
1005static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) 1014static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
1006{ 1015{
1007 struct nfs_inode *nfsi = NFS_I(inode); 1016 struct nfs_inode *nfsi = NFS_I(inode);
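
The new nfs_revalidate_inode_rcu() answers only from cached validity state and returns -ECHILD otherwise, the standard contract for rcu-walk paths that must not block. A userspace model of that contract; the predicate arguments are stand-ins for the cache_validity and NFS_STALE checks:

#include <errno.h>
#include <stdio.h>

/* If the answer is cheaply available, give it; otherwise return
 * -ECHILD so the caller retries on the slow (blocking) path. */
static int revalidate_rcu(int cache_valid, int stale)
{
	if (cache_valid)
		return stale ? -ESTALE : 0;
	return -ECHILD;		/* caller must retry in ref-walk mode */
}

int main(void)
{
	printf("%d %d %d\n",
	       revalidate_rcu(1, 0),	/* 0: cache answers */
	       revalidate_rcu(1, 1),	/* -ESTALE */
	       revalidate_rcu(0, 0));	/* -ECHILD: fall back */
	return 0;
}
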
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e2a45ae5014e..9056622d2230 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -247,11 +247,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
247int nfs_iocounter_wait(struct nfs_io_counter *c); 247int nfs_iocounter_wait(struct nfs_io_counter *c);
248 248
249extern const struct nfs_pageio_ops nfs_pgio_rw_ops; 249extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
250struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); 250struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
251void nfs_rw_header_free(struct nfs_pgio_header *); 251void nfs_pgio_header_free(struct nfs_pgio_header *);
252void nfs_pgio_data_release(struct nfs_pgio_data *); 252void nfs_pgio_data_destroy(struct nfs_pgio_header *);
253int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); 253int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
254int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, 254int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
255 const struct rpc_call_ops *, int, int); 255 const struct rpc_call_ops *, int, int);
256void nfs_free_request(struct nfs_page *req); 256void nfs_free_request(struct nfs_page *req);
257 257
@@ -451,6 +451,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
451void nfs_mark_request_commit(struct nfs_page *req, 451void nfs_mark_request_commit(struct nfs_page *req,
452 struct pnfs_layout_segment *lseg, 452 struct pnfs_layout_segment *lseg,
453 struct nfs_commit_info *cinfo); 453 struct nfs_commit_info *cinfo);
454int nfs_write_need_commit(struct nfs_pgio_header *);
454int nfs_generic_commit_list(struct inode *inode, struct list_head *head, 455int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
455 int how, struct nfs_commit_info *cinfo); 456 int how, struct nfs_commit_info *cinfo);
456void nfs_retry_commit(struct list_head *page_list, 457void nfs_retry_commit(struct list_head *page_list,
@@ -491,7 +492,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
491extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); 492extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
492 493
493/* nfs4proc.c */ 494/* nfs4proc.c */
494extern void __nfs4_read_done_cb(struct nfs_pgio_data *); 495extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
495extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, 496extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
496 const struct rpc_timeout *timeparms, 497 const struct rpc_timeout *timeparms,
497 const char *ip_addr); 498 const char *ip_addr);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 8f854dde4150..d0fec260132a 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -256,7 +256,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
256 char *p = data + *result; 256 char *p = data + *result;
257 257
258 acl = get_acl(inode, type); 258 acl = get_acl(inode, type);
259 if (!acl) 259 if (IS_ERR_OR_NULL(acl))
260 return 0; 260 return 0;
261 261
262 posix_acl_release(acl); 262 posix_acl_release(acl);
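
The !acl test above misses the case where get_acl() returns an ERR_PTR()-encoded errno rather than NULL or a valid pointer; IS_ERR_OR_NULL() covers both. A self-contained userspace model of the encoding, mirroring the kernel's convention:

#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long err) { return (void *)err; }

/* True for NULL and for pointers in the top MAX_ERRNO bytes of the
 * address space, which is where ERR_PTR() values live. */
static int IS_ERR_OR_NULL(const void *p)
{
	return !p || (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	printf("%d %d %d\n",
	       IS_ERR_OR_NULL(NULL),		/* 1 */
	       IS_ERR_OR_NULL(ERR_PTR(-12)),	/* 1: -ENOMEM encoded */
	       IS_ERR_OR_NULL("ok"));		/* 0: real pointer */
	return 0;
}
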
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f0afa291fd58..809670eba52a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -795,41 +795,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
795 return status; 795 return status;
796} 796}
797 797
798static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 798static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
799{ 799{
800 struct inode *inode = data->header->inode; 800 struct inode *inode = hdr->inode;
801 801
802 if (nfs3_async_handle_jukebox(task, inode)) 802 if (nfs3_async_handle_jukebox(task, inode))
803 return -EAGAIN; 803 return -EAGAIN;
804 804
805 nfs_invalidate_atime(inode); 805 nfs_invalidate_atime(inode);
806 nfs_refresh_inode(inode, &data->fattr); 806 nfs_refresh_inode(inode, &hdr->fattr);
807 return 0; 807 return 0;
808} 808}
809 809
810static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 810static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
811 struct rpc_message *msg)
811{ 812{
812 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; 813 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
813} 814}
814 815
815static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 816static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
817 struct nfs_pgio_header *hdr)
816{ 818{
817 rpc_call_start(task); 819 rpc_call_start(task);
818 return 0; 820 return 0;
819} 821}
820 822
821static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 823static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
822{ 824{
823 struct inode *inode = data->header->inode; 825 struct inode *inode = hdr->inode;
824 826
825 if (nfs3_async_handle_jukebox(task, inode)) 827 if (nfs3_async_handle_jukebox(task, inode))
826 return -EAGAIN; 828 return -EAGAIN;
827 if (task->tk_status >= 0) 829 if (task->tk_status >= 0)
828 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 830 nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
829 return 0; 831 return 0;
830} 832}
831 833
832static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 834static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
835 struct rpc_message *msg)
833{ 836{
834 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; 837 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
835} 838}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba2affa51941..92193eddb41d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
54 const nfs4_stateid *); 54 const nfs4_stateid *);
55 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 55 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
56 struct nfs_fsinfo *); 56 struct nfs_fsinfo *);
57 int (*free_lock_state)(struct nfs_server *, 57 void (*free_lock_state)(struct nfs_server *,
58 struct nfs4_lock_state *); 58 struct nfs4_lock_state *);
59 const struct rpc_call_ops *call_sync_ops; 59 const struct rpc_call_ops *call_sync_ops;
60 const struct nfs4_state_recovery_ops *reboot_recovery_ops; 60 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,27 +129,17 @@ enum {
129 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 129 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
130 */ 130 */
131 131
132struct nfs4_lock_owner {
133 unsigned int lo_type;
134#define NFS4_ANY_LOCK_TYPE (0U)
135#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
136#define NFS4_POSIX_LOCK_TYPE (1U << 1)
137 union {
138 fl_owner_t posix_owner;
139 pid_t flock_owner;
140 } lo_u;
141};
142
143struct nfs4_lock_state { 132struct nfs4_lock_state {
144 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
145 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
146#define NFS_LOCK_INITIALIZED 0 135#define NFS_LOCK_INITIALIZED 0
147#define NFS_LOCK_LOST 1 136#define NFS_LOCK_LOST 1
148 unsigned long ls_flags; 137 unsigned long ls_flags;
149 struct nfs_seqid_counter ls_seqid; 138 struct nfs_seqid_counter ls_seqid;
150 nfs4_stateid ls_stateid; 139 nfs4_stateid ls_stateid;
151 atomic_t ls_count; 140 atomic_t ls_count;
152 struct nfs4_lock_owner ls_owner; 141 fl_owner_t ls_owner;
142 struct work_struct ls_release;
153}; 143};
154 144
155/* bits for nfs4_state->flags */ 145/* bits for nfs4_state->flags */
@@ -337,11 +327,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
337 */ 327 */
338static inline void 328static inline void
339nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, 329nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
340 struct rpc_message *msg, struct nfs_pgio_data *wdata) 330 struct rpc_message *msg, struct nfs_pgio_header *hdr)
341{ 331{
342 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && 332 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
343 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) 333 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
344 wdata->args.stable = NFS_FILE_SYNC; 334 hdr->args.stable = NFS_FILE_SYNC;
345} 335}
346#else /* CONFIG_NFS_v4_1 */ 336#else /* CONFIG_NFS_v4_1 */
347static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 337static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +359,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
369 359
370static inline void 360static inline void
371nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, 361nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
372 struct rpc_message *msg, struct nfs_pgio_data *wdata) 362 struct rpc_message *msg, struct nfs_pgio_header *hdr)
373{ 363{
374} 364}
375#endif /* CONFIG_NFS_V4_1 */ 365#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index aa9ef4876046..53e435a95260 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -855,6 +855,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
855 }; 855 };
856 struct rpc_timeout ds_timeout; 856 struct rpc_timeout ds_timeout;
857 struct nfs_client *clp; 857 struct nfs_client *clp;
858 char buf[INET6_ADDRSTRLEN + 1];
859
860 if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
861 return ERR_PTR(-EINVAL);
862 cl_init.hostname = buf;
858 863
859 /* 864 /*
 860 * Set an authflavor equal to the MDS value. Use the MDS nfs_client 865
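
Formatting the data-server address with rpc_ntop() gives the DS nfs_client a presentation-form hostname up front, and fails fast on an address that cannot be rendered. A userspace analogue using inet_ntop(), with a hard-coded sample address:

#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	struct in_addr a = { .s_addr = htonl(0xC0A80001) }; /* 192.168.0.1 */
	char buf[INET6_ADDRSTRLEN + 1];

	/* Like the rpc_ntop() call above: refuse if unprintable. */
	if (!inet_ntop(AF_INET, &a, buf, sizeof(buf)))
		return 1;
	printf("hostname: %s\n", buf);
	return 0;
}
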
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bf3d97cc5a0..75ae8d22f067 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1952,6 +1952,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1952 return status; 1952 return status;
1953} 1953}
1954 1954
1955/*
1956 * Additional permission checks in order to distinguish between an
1957 * open for read, and an open for execute. This works around the
1958 * fact that NFSv4 OPEN treats read and execute permissions as being
1959 * the same.
1960 * Note that in the non-execute case, we want to turn off permission
1961 * checking if we just created a new file (POSIX open() semantics).
1962 */
1955static int nfs4_opendata_access(struct rpc_cred *cred, 1963static int nfs4_opendata_access(struct rpc_cred *cred,
1956 struct nfs4_opendata *opendata, 1964 struct nfs4_opendata *opendata,
1957 struct nfs4_state *state, fmode_t fmode, 1965 struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1974,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
1966 return 0; 1974 return 0;
1967 1975
1968 mask = 0; 1976 mask = 0;
1969 /* don't check MAY_WRITE - a newly created file may not have 1977 /*
1970 * write mode bits, but POSIX allows the creating process to write. 1978 * Use openflags to check for exec, because fmode won't
1971 * use openflags to check for exec, because fmode won't 1979 * always have FMODE_EXEC set when file open for exec.
1972 * always have FMODE_EXEC set when file open for exec. */ 1980 */
1973 if (openflags & __FMODE_EXEC) { 1981 if (openflags & __FMODE_EXEC) {
1974 /* ONLY check for exec rights */ 1982 /* ONLY check for exec rights */
1975 mask = MAY_EXEC; 1983 mask = MAY_EXEC;
1976 } else if (fmode & FMODE_READ) 1984 } else if ((fmode & FMODE_READ) && !opendata->file_created)
1977 mask = MAY_READ; 1985 mask = MAY_READ;
1978 1986
1979 cache.cred = cred; 1987 cache.cred = cred;
@@ -2216,8 +2224,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2216 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); 2224 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
2217 2225
2218 ret = _nfs4_proc_open(opendata); 2226 ret = _nfs4_proc_open(opendata);
2219 if (ret != 0) 2227 if (ret != 0) {
2228 if (ret == -ENOENT) {
2229 d_drop(opendata->dentry);
2230 d_add(opendata->dentry, NULL);
2231 nfs_set_verifier(opendata->dentry,
2232 nfs_save_change_attribute(opendata->dir->d_inode));
2233 }
2220 goto out; 2234 goto out;
2235 }
2221 2236
2222 state = nfs4_opendata_to_nfs4_state(opendata); 2237 state = nfs4_opendata_to_nfs4_state(opendata);
2223 ret = PTR_ERR(state); 2238 ret = PTR_ERR(state);
@@ -2647,6 +2662,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
2647 .rpc_release = nfs4_free_closedata, 2662 .rpc_release = nfs4_free_closedata,
2648}; 2663};
2649 2664
2665static bool nfs4_state_has_opener(struct nfs4_state *state)
2666{
2667 /* first check existing openers */
2668 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2669 state->n_rdonly != 0)
2670 return true;
2671
2672 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2673 state->n_wronly != 0)
2674 return true;
2675
2676 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2677 state->n_rdwr != 0)
2678 return true;
2679
2680 return false;
2681}
2682
2683static bool nfs4_roc(struct inode *inode)
2684{
2685 struct nfs_inode *nfsi = NFS_I(inode);
2686 struct nfs_open_context *ctx;
2687 struct nfs4_state *state;
2688
2689 spin_lock(&inode->i_lock);
2690 list_for_each_entry(ctx, &nfsi->open_files, list) {
2691 state = ctx->state;
2692 if (state == NULL)
2693 continue;
2694 if (nfs4_state_has_opener(state)) {
2695 spin_unlock(&inode->i_lock);
2696 return false;
2697 }
2698 }
2699 spin_unlock(&inode->i_lock);
2700
2701 if (nfs4_check_delegation(inode, FMODE_READ))
2702 return false;
2703
2704 return pnfs_roc(inode);
2705}
2706
2650/* 2707/*
2651 * It is possible for data to be read/written from a mem-mapped file 2708 * It is possible for data to be read/written from a mem-mapped file
2652 * after the sys_close call (which hits the vfs layer as a flush). 2709 * after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2754,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2697 calldata->res.fattr = &calldata->fattr; 2754 calldata->res.fattr = &calldata->fattr;
2698 calldata->res.seqid = calldata->arg.seqid; 2755 calldata->res.seqid = calldata->arg.seqid;
2699 calldata->res.server = server; 2756 calldata->res.server = server;
2700 calldata->roc = pnfs_roc(state->inode); 2757 calldata->roc = nfs4_roc(state->inode);
2701 nfs_sb_active(calldata->inode->i_sb); 2758 nfs_sb_active(calldata->inode->i_sb);
2702 2759
2703 msg.rpc_argp = &calldata->arg; 2760 msg.rpc_argp = &calldata->arg;
@@ -4033,24 +4090,25 @@ static bool nfs4_error_stateid_expired(int err)
4033 return false; 4090 return false;
4034} 4091}
4035 4092
4036void __nfs4_read_done_cb(struct nfs_pgio_data *data) 4093void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
4037{ 4094{
4038 nfs_invalidate_atime(data->header->inode); 4095 nfs_invalidate_atime(hdr->inode);
4039} 4096}
4040 4097
4041static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) 4098static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
4042{ 4099{
4043 struct nfs_server *server = NFS_SERVER(data->header->inode); 4100 struct nfs_server *server = NFS_SERVER(hdr->inode);
4044 4101
4045 trace_nfs4_read(data, task->tk_status); 4102 trace_nfs4_read(hdr, task->tk_status);
4046 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 4103 if (nfs4_async_handle_error(task, server,
4104 hdr->args.context->state) == -EAGAIN) {
4047 rpc_restart_call_prepare(task); 4105 rpc_restart_call_prepare(task);
4048 return -EAGAIN; 4106 return -EAGAIN;
4049 } 4107 }
4050 4108
4051 __nfs4_read_done_cb(data); 4109 __nfs4_read_done_cb(hdr);
4052 if (task->tk_status > 0) 4110 if (task->tk_status > 0)
4053 renew_lease(server, data->timestamp); 4111 renew_lease(server, hdr->timestamp);
4054 return 0; 4112 return 0;
4055} 4113}
4056 4114
@@ -4068,54 +4126,59 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
4068 return true; 4126 return true;
4069} 4127}
4070 4128
4071static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 4129static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
4072{ 4130{
4073 4131
4074 dprintk("--> %s\n", __func__); 4132 dprintk("--> %s\n", __func__);
4075 4133
4076 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4134 if (!nfs4_sequence_done(task, &hdr->res.seq_res))
4077 return -EAGAIN; 4135 return -EAGAIN;
4078 if (nfs4_read_stateid_changed(task, &data->args)) 4136 if (nfs4_read_stateid_changed(task, &hdr->args))
4079 return -EAGAIN; 4137 return -EAGAIN;
4080 return data->pgio_done_cb ? data->pgio_done_cb(task, data) : 4138 return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
4081 nfs4_read_done_cb(task, data); 4139 nfs4_read_done_cb(task, hdr);
4082} 4140}
4083 4141
4084static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 4142static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
4143 struct rpc_message *msg)
4085{ 4144{
4086 data->timestamp = jiffies; 4145 hdr->timestamp = jiffies;
4087 data->pgio_done_cb = nfs4_read_done_cb; 4146 hdr->pgio_done_cb = nfs4_read_done_cb;
4088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 4147 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
4089 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 4148 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
4090} 4149}
4091 4150
4092static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 4151static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
4152 struct nfs_pgio_header *hdr)
4093{ 4153{
4094 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 4154 if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
4095 &data->args.seq_args, 4155 &hdr->args.seq_args,
4096 &data->res.seq_res, 4156 &hdr->res.seq_res,
4097 task)) 4157 task))
4098 return 0; 4158 return 0;
4099 if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, 4159 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
4100 data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) 4160 hdr->args.lock_context,
4161 hdr->rw_ops->rw_mode) == -EIO)
4101 return -EIO; 4162 return -EIO;
4102 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) 4163 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
4103 return -EIO; 4164 return -EIO;
4104 return 0; 4165 return 0;
4105} 4166}
4106 4167
4107static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) 4168static int nfs4_write_done_cb(struct rpc_task *task,
4169 struct nfs_pgio_header *hdr)
4108{ 4170{
4109 struct inode *inode = data->header->inode; 4171 struct inode *inode = hdr->inode;
4110 4172
4111 trace_nfs4_write(data, task->tk_status); 4173 trace_nfs4_write(hdr, task->tk_status);
4112 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 4174 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4175 hdr->args.context->state) == -EAGAIN) {
4113 rpc_restart_call_prepare(task); 4176 rpc_restart_call_prepare(task);
4114 return -EAGAIN; 4177 return -EAGAIN;
4115 } 4178 }
4116 if (task->tk_status >= 0) { 4179 if (task->tk_status >= 0) {
4117 renew_lease(NFS_SERVER(inode), data->timestamp); 4180 renew_lease(NFS_SERVER(inode), hdr->timestamp);
4118 nfs_post_op_update_inode_force_wcc(inode, &data->fattr); 4181 nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
4119 } 4182 }
4120 return 0; 4183 return 0;
4121} 4184}
@@ -4134,23 +4197,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
4134 return true; 4197 return true;
4135} 4198}
4136 4199
4137static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 4200static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
4138{ 4201{
4139 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4202 if (!nfs4_sequence_done(task, &hdr->res.seq_res))
4140 return -EAGAIN; 4203 return -EAGAIN;
4141 if (nfs4_write_stateid_changed(task, &data->args)) 4204 if (nfs4_write_stateid_changed(task, &hdr->args))
4142 return -EAGAIN; 4205 return -EAGAIN;
4143 return data->pgio_done_cb ? data->pgio_done_cb(task, data) : 4206 return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
4144 nfs4_write_done_cb(task, data); 4207 nfs4_write_done_cb(task, hdr);
4145} 4208}
4146 4209
4147static 4210static
4148bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) 4211bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
4149{ 4212{
4150 const struct nfs_pgio_header *hdr = data->header;
4151
4152 /* Don't request attributes for pNFS or O_DIRECT writes */ 4213 /* Don't request attributes for pNFS or O_DIRECT writes */
4153 if (data->ds_clp != NULL || hdr->dreq != NULL) 4214 if (hdr->ds_clp != NULL || hdr->dreq != NULL)
4154 return false; 4215 return false;
4155 /* Otherwise, request attributes if and only if we don't hold 4216 /* Otherwise, request attributes if and only if we don't hold
4156 * a delegation 4217 * a delegation
@@ -4158,23 +4219,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
4158 return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; 4219 return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
4159} 4220}
4160 4221
4161static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 4222static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
4223 struct rpc_message *msg)
4162{ 4224{
4163 struct nfs_server *server = NFS_SERVER(data->header->inode); 4225 struct nfs_server *server = NFS_SERVER(hdr->inode);
4164 4226
4165 if (!nfs4_write_need_cache_consistency_data(data)) { 4227 if (!nfs4_write_need_cache_consistency_data(hdr)) {
4166 data->args.bitmask = NULL; 4228 hdr->args.bitmask = NULL;
4167 data->res.fattr = NULL; 4229 hdr->res.fattr = NULL;
4168 } else 4230 } else
4169 data->args.bitmask = server->cache_consistency_bitmask; 4231 hdr->args.bitmask = server->cache_consistency_bitmask;
4170 4232
4171 if (!data->pgio_done_cb) 4233 if (!hdr->pgio_done_cb)
4172 data->pgio_done_cb = nfs4_write_done_cb; 4234 hdr->pgio_done_cb = nfs4_write_done_cb;
4173 data->res.server = server; 4235 hdr->res.server = server;
4174 data->timestamp = jiffies; 4236 hdr->timestamp = jiffies;
4175 4237
4176 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 4238 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
4177 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 4239 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
4178} 4240}
4179 4241
4180static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 4242static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4881,6 +4943,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
4881 return scnprintf(buf, len, "tcp"); 4943 return scnprintf(buf, len, "tcp");
4882} 4944}
4883 4945
4946static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
4947{
4948 struct nfs4_setclientid *sc = calldata;
4949
4950 if (task->tk_status == 0)
4951 sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
4952}
4953
4954static const struct rpc_call_ops nfs4_setclientid_ops = {
4955 .rpc_call_done = nfs4_setclientid_done,
4956};
4957
4884/** 4958/**
4885 * nfs4_proc_setclientid - Negotiate client ID 4959 * nfs4_proc_setclientid - Negotiate client ID
4886 * @clp: state data structure 4960 * @clp: state data structure
@@ -4907,6 +4981,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4907 .rpc_resp = res, 4981 .rpc_resp = res,
4908 .rpc_cred = cred, 4982 .rpc_cred = cred,
4909 }; 4983 };
4984 struct rpc_task *task;
4985 struct rpc_task_setup task_setup_data = {
4986 .rpc_client = clp->cl_rpcclient,
4987 .rpc_message = &msg,
4988 .callback_ops = &nfs4_setclientid_ops,
4989 .callback_data = &setclientid,
4990 .flags = RPC_TASK_TIMEOUT,
4991 };
4910 int status; 4992 int status;
4911 4993
4912 /* nfs_client_id4 */ 4994 /* nfs_client_id4 */
@@ -4933,7 +5015,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4933 dprintk("NFS call setclientid auth=%s, '%.*s'\n", 5015 dprintk("NFS call setclientid auth=%s, '%.*s'\n",
4934 clp->cl_rpcclient->cl_auth->au_ops->au_name, 5016 clp->cl_rpcclient->cl_auth->au_ops->au_name,
4935 setclientid.sc_name_len, setclientid.sc_name); 5017 setclientid.sc_name_len, setclientid.sc_name);
4936 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 5018 task = rpc_run_task(&task_setup_data);
5019 if (IS_ERR(task)) {
5020 status = PTR_ERR(task);
5021 goto out;
5022 }
5023 status = task->tk_status;
5024 if (setclientid.sc_cred) {
5025 clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
5026 put_rpccred(setclientid.sc_cred);
5027 }
5028 rpc_put_task(task);
5029out:
4937 trace_nfs4_setclientid(clp, status); 5030 trace_nfs4_setclientid(clp, status);
4938 dprintk("NFS reply setclientid: %d\n", status); 5031 dprintk("NFS reply setclientid: %d\n", status);
4939 return status; 5032 return status;
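
The rpc_call_sync() call becomes rpc_run_task() with a rpc_call_done callback so the acceptor credential can be captured from the request while it still exists, then stringified after the task completes. A userspace sketch of that capture-in-callback shape; all types here are invented stand-ins for the RPC machinery:

#include <stdio.h>

struct task { int status; const char *cred; };
struct setclientid_ctx { const char *sc_cred; };

/* Runs at completion, while per-request state is still alive. */
static void setclientid_done(struct task *t, void *calldata)
{
	struct setclientid_ctx *c = calldata;

	if (t->status == 0)
		c->sc_cred = t->cred;	/* grab it before teardown */
}

int main(void)
{
	struct setclientid_ctx ctx = { NULL };
	struct task t = { 0, "nfs@server.example" };

	setclientid_done(&t, &ctx);	/* RPC layer would invoke this */
	printf("acceptor: %s\n", ctx.sc_cred ? ctx.sc_cred : "(none)");
	return 0;
}
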
@@ -4975,6 +5068,9 @@ struct nfs4_delegreturndata {
4975 unsigned long timestamp; 5068 unsigned long timestamp;
4976 struct nfs_fattr fattr; 5069 struct nfs_fattr fattr;
4977 int rpc_status; 5070 int rpc_status;
5071 struct inode *inode;
5072 bool roc;
5073 u32 roc_barrier;
4978}; 5074};
4979 5075
4980static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) 5076static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5084,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4988 switch (task->tk_status) { 5084 switch (task->tk_status) {
4989 case 0: 5085 case 0:
4990 renew_lease(data->res.server, data->timestamp); 5086 renew_lease(data->res.server, data->timestamp);
4991 break;
4992 case -NFS4ERR_ADMIN_REVOKED: 5087 case -NFS4ERR_ADMIN_REVOKED:
4993 case -NFS4ERR_DELEG_REVOKED: 5088 case -NFS4ERR_DELEG_REVOKED:
4994 case -NFS4ERR_BAD_STATEID: 5089 case -NFS4ERR_BAD_STATEID:
@@ -4996,6 +5091,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4996 case -NFS4ERR_STALE_STATEID: 5091 case -NFS4ERR_STALE_STATEID:
4997 case -NFS4ERR_EXPIRED: 5092 case -NFS4ERR_EXPIRED:
4998 task->tk_status = 0; 5093 task->tk_status = 0;
5094 if (data->roc)
5095 pnfs_roc_set_barrier(data->inode, data->roc_barrier);
4999 break; 5096 break;
5000 default: 5097 default:
5001 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5098 if (nfs4_async_handle_error(task, data->res.server, NULL) ==
@@ -5009,6 +5106,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5009 5106
5010static void nfs4_delegreturn_release(void *calldata) 5107static void nfs4_delegreturn_release(void *calldata)
5011{ 5108{
5109 struct nfs4_delegreturndata *data = calldata;
5110
5111 if (data->roc)
5112 pnfs_roc_release(data->inode);
5012 kfree(calldata); 5113 kfree(calldata);
5013} 5114}
5014 5115
@@ -5018,6 +5119,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
5018 5119
5019 d_data = (struct nfs4_delegreturndata *)data; 5120 d_data = (struct nfs4_delegreturndata *)data;
5020 5121
5122 if (d_data->roc &&
5123 pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
5124 return;
5125
5021 nfs4_setup_sequence(d_data->res.server, 5126 nfs4_setup_sequence(d_data->res.server,
5022 &d_data->args.seq_args, 5127 &d_data->args.seq_args,
5023 &d_data->res.seq_res, 5128 &d_data->res.seq_res,
@@ -5061,6 +5166,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5061 nfs_fattr_init(data->res.fattr); 5166 nfs_fattr_init(data->res.fattr);
5062 data->timestamp = jiffies; 5167 data->timestamp = jiffies;
5063 data->rpc_status = 0; 5168 data->rpc_status = 0;
5169 data->inode = inode;
5170 data->roc = list_empty(&NFS_I(inode)->open_files) ?
5171 pnfs_roc(inode) : false;
5064 5172
5065 task_setup_data.callback_data = data; 5173 task_setup_data.callback_data = data;
5066 msg.rpc_argp = &data->args; 5174 msg.rpc_argp = &data->args;
@@ -5834,8 +5942,10 @@ struct nfs_release_lockowner_data {
5834static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) 5942static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
5835{ 5943{
5836 struct nfs_release_lockowner_data *data = calldata; 5944 struct nfs_release_lockowner_data *data = calldata;
5837 nfs40_setup_sequence(data->server, 5945 struct nfs_server *server = data->server;
5838 &data->args.seq_args, &data->res.seq_res, task); 5946 nfs40_setup_sequence(server, &data->args.seq_args,
5947 &data->res.seq_res, task);
5948 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5839 data->timestamp = jiffies; 5949 data->timestamp = jiffies;
5840} 5950}
5841 5951
@@ -5852,6 +5962,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5852 break; 5962 break;
5853 case -NFS4ERR_STALE_CLIENTID: 5963 case -NFS4ERR_STALE_CLIENTID:
5854 case -NFS4ERR_EXPIRED: 5964 case -NFS4ERR_EXPIRED:
5965 nfs4_schedule_lease_recovery(server->nfs_client);
5966 break;
5855 case -NFS4ERR_LEASE_MOVED: 5967 case -NFS4ERR_LEASE_MOVED:
5856 case -NFS4ERR_DELAY: 5968 case -NFS4ERR_DELAY:
5857 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) 5969 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
@@ -5872,7 +5984,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
5872 .rpc_release = nfs4_release_lockowner_release, 5984 .rpc_release = nfs4_release_lockowner_release,
5873}; 5985};
5874 5986
5875static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) 5987static void
5988nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
5876{ 5989{
5877 struct nfs_release_lockowner_data *data; 5990 struct nfs_release_lockowner_data *data;
5878 struct rpc_message msg = { 5991 struct rpc_message msg = {
@@ -5880,11 +5993,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5880 }; 5993 };
5881 5994
5882 if (server->nfs_client->cl_mvops->minor_version != 0) 5995 if (server->nfs_client->cl_mvops->minor_version != 0)
5883 return -EINVAL; 5996 return;
5884 5997
5885 data = kmalloc(sizeof(*data), GFP_NOFS); 5998 data = kmalloc(sizeof(*data), GFP_NOFS);
5886 if (!data) 5999 if (!data)
5887 return -ENOMEM; 6000 return;
5888 data->lsp = lsp; 6001 data->lsp = lsp;
5889 data->server = server; 6002 data->server = server;
5890 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 6003 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6008,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5895 msg.rpc_resp = &data->res; 6008 msg.rpc_resp = &data->res;
5896 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 6009 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
5897 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); 6010 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
5898 return 0;
5899} 6011}
5900 6012
5901#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 6013#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -8182,7 +8294,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
8182 return ret; 8294 return ret;
8183} 8295}
8184 8296
8185static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 8297static void
8298nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
8186{ 8299{
8187 struct rpc_task *task; 8300 struct rpc_task *task;
8188 struct rpc_cred *cred = lsp->ls_state->owner->so_cred; 8301 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8303,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
8190 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); 8303 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
8191 nfs4_free_lock_state(server, lsp); 8304 nfs4_free_lock_state(server, lsp);
8192 if (IS_ERR(task)) 8305 if (IS_ERR(task))
8193 return PTR_ERR(task); 8306 return;
8194 rpc_put_task(task); 8307 rpc_put_task(task);
8195 return 0;
8196} 8308}
8197 8309
8198static bool nfs41_match_stateid(const nfs4_stateid *s1, 8310static bool nfs41_match_stateid(const nfs4_stateid *s1,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 42f121182167..a043f618cd5a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,33 +787,36 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
787 * that is compatible with current->files 787 * that is compatible with current->files
788 */ 788 */
789static struct nfs4_lock_state * 789static struct nfs4_lock_state *
790__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 790__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
791{ 791{
792 struct nfs4_lock_state *pos; 792 struct nfs4_lock_state *pos;
793 list_for_each_entry(pos, &state->lock_states, ls_locks) { 793 list_for_each_entry(pos, &state->lock_states, ls_locks) {
794 if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) 794 if (pos->ls_owner != fl_owner)
795 continue; 795 continue;
796 switch (pos->ls_owner.lo_type) {
797 case NFS4_POSIX_LOCK_TYPE:
798 if (pos->ls_owner.lo_u.posix_owner != fl_owner)
799 continue;
800 break;
801 case NFS4_FLOCK_LOCK_TYPE:
802 if (pos->ls_owner.lo_u.flock_owner != fl_pid)
803 continue;
804 }
805 atomic_inc(&pos->ls_count); 796 atomic_inc(&pos->ls_count);
806 return pos; 797 return pos;
807 } 798 }
808 return NULL; 799 return NULL;
809} 800}
810 801
802static void
803free_lock_state_work(struct work_struct *work)
804{
805 struct nfs4_lock_state *lsp = container_of(work,
806 struct nfs4_lock_state, ls_release);
807 struct nfs4_state *state = lsp->ls_state;
808 struct nfs_server *server = state->owner->so_server;
809 struct nfs_client *clp = server->nfs_client;
810
811 clp->cl_mvops->free_lock_state(server, lsp);
812}
813
811/* 814/*
812 * Return a compatible lock_state. If no initialized lock_state structure 815 * Return a compatible lock_state. If no initialized lock_state structure
813 * exists, return an uninitialized one. 816 * exists, return an uninitialized one.
814 * 817 *
815 */ 818 */
816static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 819static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
817{ 820{
818 struct nfs4_lock_state *lsp; 821 struct nfs4_lock_state *lsp;
819 struct nfs_server *server = state->owner->so_server; 822 struct nfs_server *server = state->owner->so_server;
@@ -824,21 +827,12 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
824 nfs4_init_seqid_counter(&lsp->ls_seqid); 827 nfs4_init_seqid_counter(&lsp->ls_seqid);
825 atomic_set(&lsp->ls_count, 1); 828 atomic_set(&lsp->ls_count, 1);
826 lsp->ls_state = state; 829 lsp->ls_state = state;
827 lsp->ls_owner.lo_type = type; 830 lsp->ls_owner = fl_owner;
828 switch (lsp->ls_owner.lo_type) {
829 case NFS4_FLOCK_LOCK_TYPE:
830 lsp->ls_owner.lo_u.flock_owner = fl_pid;
831 break;
832 case NFS4_POSIX_LOCK_TYPE:
833 lsp->ls_owner.lo_u.posix_owner = fl_owner;
834 break;
835 default:
836 goto out_free;
837 }
838 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); 831 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
839 if (lsp->ls_seqid.owner_id < 0) 832 if (lsp->ls_seqid.owner_id < 0)
840 goto out_free; 833 goto out_free;
841 INIT_LIST_HEAD(&lsp->ls_locks); 834 INIT_LIST_HEAD(&lsp->ls_locks);
835 INIT_WORK(&lsp->ls_release, free_lock_state_work);
842 return lsp; 836 return lsp;
843out_free: 837out_free:
844 kfree(lsp); 838 kfree(lsp);
@@ -857,13 +851,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
857 * exists, return an uninitialized one. 851 * exists, return an uninitialized one.
858 * 852 *
859 */ 853 */
860static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) 854static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
861{ 855{
862 struct nfs4_lock_state *lsp, *new = NULL; 856 struct nfs4_lock_state *lsp, *new = NULL;
863 857
864 for(;;) { 858 for(;;) {
865 spin_lock(&state->state_lock); 859 spin_lock(&state->state_lock);
866 lsp = __nfs4_find_lock_state(state, owner, pid, type); 860 lsp = __nfs4_find_lock_state(state, owner);
867 if (lsp != NULL) 861 if (lsp != NULL)
868 break; 862 break;
869 if (new != NULL) { 863 if (new != NULL) {
@@ -874,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
874 break; 868 break;
875 } 869 }
876 spin_unlock(&state->state_lock); 870 spin_unlock(&state->state_lock);
877 new = nfs4_alloc_lock_state(state, owner, pid, type); 871 new = nfs4_alloc_lock_state(state, owner);
878 if (new == NULL) 872 if (new == NULL)
879 return NULL; 873 return NULL;
880 } 874 }
@@ -902,13 +896,12 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
902 if (list_empty(&state->lock_states)) 896 if (list_empty(&state->lock_states))
903 clear_bit(LK_STATE_IN_USE, &state->flags); 897 clear_bit(LK_STATE_IN_USE, &state->flags);
904 spin_unlock(&state->state_lock); 898 spin_unlock(&state->state_lock);
905 server = state->owner->so_server; 899 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
906 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { 900 queue_work(nfsiod_workqueue, &lsp->ls_release);
907 struct nfs_client *clp = server->nfs_client; 901 else {
908 902 server = state->owner->so_server;
909 clp->cl_mvops->free_lock_state(server, lsp);
910 } else
911 nfs4_free_lock_state(server, lsp); 903 nfs4_free_lock_state(server, lsp);
904 }
912} 905}
913 906
914static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 907static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
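
nfs4_put_lock_state() now defers the FREE_STATEID call to nfsiod via the ls_release work item, since the final put can happen in contexts where a blocking RPC would be unsafe. A userspace sketch of the hand-off, using a plain thread in place of queue_work():

#include <pthread.h>
#include <stdio.h>

/* Worker side: here it is safe to sleep for the duration of the RPC. */
static void *free_lock_state_work(void *lsp)
{
	printf("worker: issuing blocking FREE_STATEID for %p\n", lsp);
	return NULL;
}

/* Put side: non-blocking context, so just hand the state off. */
static void put_lock_state(void *lsp, pthread_t *worker)
{
	pthread_create(worker, NULL, free_lock_state_work, lsp);
}

int main(void)
{
	int lsp;
	pthread_t worker;

	put_lock_state(&lsp, &worker);
	pthread_join(worker, NULL);
	return 0;
}
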
@@ -935,13 +928,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
935 928
936 if (fl->fl_ops != NULL) 929 if (fl->fl_ops != NULL)
937 return 0; 930 return 0;
938 if (fl->fl_flags & FL_POSIX) 931 lsp = nfs4_get_lock_state(state, fl->fl_owner);
939 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
940 else if (fl->fl_flags & FL_FLOCK)
941 lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
942 NFS4_FLOCK_LOCK_TYPE);
943 else
944 return -EINVAL;
945 if (lsp == NULL) 932 if (lsp == NULL)
946 return -ENOMEM; 933 return -ENOMEM;
947 fl->fl_u.nfs4_fl.owner = lsp; 934 fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +942,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
955{ 942{
956 struct nfs4_lock_state *lsp; 943 struct nfs4_lock_state *lsp;
957 fl_owner_t fl_owner; 944 fl_owner_t fl_owner;
958 pid_t fl_pid;
959 int ret = -ENOENT; 945 int ret = -ENOENT;
960 946
961 947
@@ -966,9 +952,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
966 goto out; 952 goto out;
967 953
968 fl_owner = lockowner->l_owner; 954 fl_owner = lockowner->l_owner;
969 fl_pid = lockowner->l_pid;
970 spin_lock(&state->state_lock); 955 spin_lock(&state->state_lock);
971 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 956 lsp = __nfs4_find_lock_state(state, fl_owner);
972 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) 957 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
973 ret = -EIO; 958 ret = -EIO;
974 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { 959 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
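
With the lo_type/lo_u union gone, every lock state, POSIX or flock alike, is keyed by a single opaque fl_owner_t, and __nfs4_find_lock_state() shrinks to a one-field comparison. A userspace sketch of that lookup over a hypothetical singly linked list:

#include <stdio.h>

typedef void *fl_owner_t;

struct lock_state { fl_owner_t owner; struct lock_state *next; };

/* One opaque key, one comparison: the simplified lookup shape. */
static struct lock_state *find_lock_state(struct lock_state *head,
					  fl_owner_t owner)
{
	for (; head; head = head->next)
		if (head->owner == owner)
			return head;
	return NULL;
}

int main(void)
{
	int o1, o2;
	struct lock_state b = { &o2, NULL };
	struct lock_state a = { &o1, &b };

	printf("%s\n", find_lock_state(&a, &o2) == &b ? "found" : "missing");
	return 0;
}
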
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0a744f3a86f6..1c32adbe728d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
932 932
933DECLARE_EVENT_CLASS(nfs4_read_event, 933DECLARE_EVENT_CLASS(nfs4_read_event,
934 TP_PROTO( 934 TP_PROTO(
935 const struct nfs_pgio_data *data, 935 const struct nfs_pgio_header *hdr,
936 int error 936 int error
937 ), 937 ),
938 938
939 TP_ARGS(data, error), 939 TP_ARGS(hdr, error),
940 940
941 TP_STRUCT__entry( 941 TP_STRUCT__entry(
942 __field(dev_t, dev) 942 __field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
948 ), 948 ),
949 949
950 TP_fast_assign( 950 TP_fast_assign(
951 const struct inode *inode = data->header->inode; 951 const struct inode *inode = hdr->inode;
952 __entry->dev = inode->i_sb->s_dev; 952 __entry->dev = inode->i_sb->s_dev;
953 __entry->fileid = NFS_FILEID(inode); 953 __entry->fileid = NFS_FILEID(inode);
954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); 954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
955 __entry->offset = data->args.offset; 955 __entry->offset = hdr->args.offset;
956 __entry->count = data->args.count; 956 __entry->count = hdr->args.count;
957 __entry->error = error; 957 __entry->error = error;
958 ), 958 ),
959 959
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
972#define DEFINE_NFS4_READ_EVENT(name) \ 972#define DEFINE_NFS4_READ_EVENT(name) \
973 DEFINE_EVENT(nfs4_read_event, name, \ 973 DEFINE_EVENT(nfs4_read_event, name, \
974 TP_PROTO( \ 974 TP_PROTO( \
975 const struct nfs_pgio_data *data, \ 975 const struct nfs_pgio_header *hdr, \
976 int error \ 976 int error \
977 ), \ 977 ), \
978 TP_ARGS(data, error)) 978 TP_ARGS(hdr, error))
979DEFINE_NFS4_READ_EVENT(nfs4_read); 979DEFINE_NFS4_READ_EVENT(nfs4_read);
980#ifdef CONFIG_NFS_V4_1 980#ifdef CONFIG_NFS_V4_1
981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); 981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
983 983
984DECLARE_EVENT_CLASS(nfs4_write_event, 984DECLARE_EVENT_CLASS(nfs4_write_event,
985 TP_PROTO( 985 TP_PROTO(
986 const struct nfs_pgio_data *data, 986 const struct nfs_pgio_header *hdr,
987 int error 987 int error
988 ), 988 ),
989 989
990 TP_ARGS(data, error), 990 TP_ARGS(hdr, error),
991 991
992 TP_STRUCT__entry( 992 TP_STRUCT__entry(
993 __field(dev_t, dev) 993 __field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
999 ), 999 ),
1000 1000
1001 TP_fast_assign( 1001 TP_fast_assign(
1002 const struct inode *inode = data->header->inode; 1002 const struct inode *inode = hdr->inode;
1003 __entry->dev = inode->i_sb->s_dev; 1003 __entry->dev = inode->i_sb->s_dev;
1004 __entry->fileid = NFS_FILEID(inode); 1004 __entry->fileid = NFS_FILEID(inode);
1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); 1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
1006 __entry->offset = data->args.offset; 1006 __entry->offset = hdr->args.offset;
1007 __entry->count = data->args.count; 1007 __entry->count = hdr->args.count;
1008 __entry->error = error; 1008 __entry->error = error;
1009 ), 1009 ),
1010 1010
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
1024#define DEFINE_NFS4_WRITE_EVENT(name) \ 1024#define DEFINE_NFS4_WRITE_EVENT(name) \
1025 DEFINE_EVENT(nfs4_write_event, name, \ 1025 DEFINE_EVENT(nfs4_write_event, name, \
1026 TP_PROTO( \ 1026 TP_PROTO( \
1027 const struct nfs_pgio_data *data, \ 1027 const struct nfs_pgio_header *hdr, \
1028 int error \ 1028 int error \
1029 ), \ 1029 ), \
1030 TP_ARGS(data, error)) 1030 TP_ARGS(hdr, error))
1031DEFINE_NFS4_WRITE_EVENT(nfs4_write); 1031DEFINE_NFS4_WRITE_EVENT(nfs4_write);
1032#ifdef CONFIG_NFS_V4_1 1032#ifdef CONFIG_NFS_V4_1
1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); 1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
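
The tracepoint classes above now take the nfs_pgio_header directly, and each DEFINE_NFS4_READ_EVENT/DEFINE_NFS4_WRITE_EVENT invocation stamps out a named event from the shared template. A rough user-space analogue of that one-class-many-events pattern, assuming made-up pgio_hdr and trace_* names (this is not the kernel tracepoint API):

#include <stdio.h>

struct pgio_hdr {                 /* stand-in for struct nfs_pgio_header */
	unsigned long long offset;
	unsigned int count;
};

/* one "event class": a macro template generating per-event functions */
#define DEFINE_PGIO_EVENT(name)						\
	static void trace_##name(const struct pgio_hdr *hdr, int error)\
	{								\
		printf(#name ": %u bytes @ %llu, error=%d\n",		\
		       hdr->count, hdr->offset, error);			\
	}

DEFINE_PGIO_EVENT(nfs4_read)       /* mirrors DEFINE_NFS4_READ_EVENT */
DEFINE_PGIO_EVENT(nfs4_pnfs_read)

int main(void)
{
	struct pgio_hdr hdr = { .offset = 4096, .count = 512 };

	trace_nfs4_read(&hdr, 0);
	trace_nfs4_pnfs_read(&hdr, -5);
	return 0;
}
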
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 939ae606cfa4..e13b59d8d9aa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7092,7 +7092,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
7092 if (!status) 7092 if (!status)
7093 status = decode_sequence(xdr, &res->seq_res, rqstp); 7093 status = decode_sequence(xdr, &res->seq_res, rqstp);
7094 if (!status) 7094 if (!status)
7095 status = decode_reclaim_complete(xdr, (void *)NULL); 7095 status = decode_reclaim_complete(xdr, NULL);
7096 return status; 7096 return status;
7097} 7097}
7098 7098
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 611320753db2..ae05278b3761 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -439,22 +439,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
439 objlayout_read_done(&objios->oir, status, objios->sync); 439 objlayout_read_done(&objios->oir, status, objios->sync);
440} 440}
441 441
442int objio_read_pagelist(struct nfs_pgio_data *rdata) 442int objio_read_pagelist(struct nfs_pgio_header *hdr)
443{ 443{
444 struct nfs_pgio_header *hdr = rdata->header;
445 struct objio_state *objios; 444 struct objio_state *objios;
446 int ret; 445 int ret;
447 446
448 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, 447 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
449 hdr->lseg, rdata->args.pages, rdata->args.pgbase, 448 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
450 rdata->args.offset, rdata->args.count, rdata, 449 hdr->args.offset, hdr->args.count, hdr,
451 GFP_KERNEL, &objios); 450 GFP_KERNEL, &objios);
452 if (unlikely(ret)) 451 if (unlikely(ret))
453 return ret; 452 return ret;
454 453
455 objios->ios->done = _read_done; 454 objios->ios->done = _read_done;
456 dprintk("%s: offset=0x%llx length=0x%x\n", __func__, 455 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
457 rdata->args.offset, rdata->args.count); 456 hdr->args.offset, hdr->args.count);
458 ret = ore_read(objios->ios); 457 ret = ore_read(objios->ios);
459 if (unlikely(ret)) 458 if (unlikely(ret))
460 objio_free_result(&objios->oir); 459 objio_free_result(&objios->oir);
@@ -487,11 +486,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
487static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) 486static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
488{ 487{
489 struct objio_state *objios = priv; 488 struct objio_state *objios = priv;
490 struct nfs_pgio_data *wdata = objios->oir.rpcdata; 489 struct nfs_pgio_header *hdr = objios->oir.rpcdata;
491 struct address_space *mapping = wdata->header->inode->i_mapping; 490 struct address_space *mapping = hdr->inode->i_mapping;
492 pgoff_t index = offset / PAGE_SIZE; 491 pgoff_t index = offset / PAGE_SIZE;
493 struct page *page; 492 struct page *page;
494 loff_t i_size = i_size_read(wdata->header->inode); 493 loff_t i_size = i_size_read(hdr->inode);
495 494
496 if (offset >= i_size) { 495 if (offset >= i_size) {
497 *uptodate = true; 496 *uptodate = true;
@@ -531,15 +530,14 @@ static const struct _ore_r4w_op _r4w_op = {
531 .put_page = &__r4w_put_page, 530 .put_page = &__r4w_put_page,
532}; 531};
533 532
534int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) 533int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
535{ 534{
536 struct nfs_pgio_header *hdr = wdata->header;
537 struct objio_state *objios; 535 struct objio_state *objios;
538 int ret; 536 int ret;
539 537
540 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, 538 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
541 hdr->lseg, wdata->args.pages, wdata->args.pgbase, 539 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
542 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, 540 hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
543 &objios); 541 &objios);
544 if (unlikely(ret)) 542 if (unlikely(ret))
545 return ret; 543 return ret;
@@ -551,7 +549,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
551 objios->ios->done = _write_done; 549 objios->ios->done = _write_done;
552 550
553 dprintk("%s: offset=0x%llx length=0x%x\n", __func__, 551 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
554 wdata->args.offset, wdata->args.count); 552 hdr->args.offset, hdr->args.count);
555 ret = ore_write(objios->ios); 553 ret = ore_write(objios->ios);
556 if (unlikely(ret)) { 554 if (unlikely(ret)) {
557 objio_free_result(&objios->oir); 555 objio_free_result(&objios->oir);
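
With the nfs_pgio_data/nfs_pgio_header merge, the objio read and write paths take the header itself and pull offset, count, and pages straight from hdr->args, dropping one level of indirection. A hedged sketch of that slimmer call path, with invented pgio_args and alloc_io_state() stand-ins:

#include <stdio.h>

struct pgio_args {
	unsigned long long offset;
	unsigned int count;
};

struct pgio_hdr {
	struct pgio_args args;    /* was rdata->args before the merge */
};

/* hypothetical stand-in for objio_alloc_io_state() */
static int alloc_io_state(unsigned long long off, unsigned int cnt)
{
	printf("io state: offset=0x%llx length=0x%x\n", off, cnt);
	return 0;
}

static int read_pagelist(struct pgio_hdr *hdr)
{
	/* everything the driver needs lives on hdr now */
	return alloc_io_state(hdr->args.offset, hdr->args.count);
}

int main(void)
{
	struct pgio_hdr hdr = { .args = { .offset = 0x1000, .count = 0x200 } };

	return read_pagelist(&hdr);
}
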
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 765d3f54e986..697a16d11fac 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
229static void _rpc_read_complete(struct work_struct *work) 229static void _rpc_read_complete(struct work_struct *work)
230{ 230{
231 struct rpc_task *task; 231 struct rpc_task *task;
232 struct nfs_pgio_data *rdata; 232 struct nfs_pgio_header *hdr;
233 233
234 dprintk("%s enter\n", __func__); 234 dprintk("%s enter\n", __func__);
235 task = container_of(work, struct rpc_task, u.tk_work); 235 task = container_of(work, struct rpc_task, u.tk_work);
236 rdata = container_of(task, struct nfs_pgio_data, task); 236 hdr = container_of(task, struct nfs_pgio_header, task);
237 237
238 pnfs_ld_read_done(rdata); 238 pnfs_ld_read_done(hdr);
239} 239}
240 240
241void 241void
242objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) 242objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
243{ 243{
244 struct nfs_pgio_data *rdata = oir->rpcdata; 244 struct nfs_pgio_header *hdr = oir->rpcdata;
245 245
246 oir->status = rdata->task.tk_status = status; 246 oir->status = hdr->task.tk_status = status;
247 if (status >= 0) 247 if (status >= 0)
248 rdata->res.count = status; 248 hdr->res.count = status;
249 else 249 else
250 rdata->header->pnfs_error = status; 250 hdr->pnfs_error = status;
251 objlayout_iodone(oir); 251 objlayout_iodone(oir);
252 /* must not use oir after this point */ 252 /* must not use oir after this point */
253 253
254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, 254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
255 status, rdata->res.eof, sync); 255 status, hdr->res.eof, sync);
256 256
257 if (sync) 257 if (sync)
258 pnfs_ld_read_done(rdata); 258 pnfs_ld_read_done(hdr);
259 else { 259 else {
260 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); 260 INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
261 schedule_work(&rdata->task.u.tk_work); 261 schedule_work(&hdr->task.u.tk_work);
262 } 262 }
263} 263}
264 264
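
Both cleanup handlers recover the enclosing header from the embedded rpc_task via container_of(); only the containing type changed in this patch. A runnable demonstration of the trick, using stand-in struct names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct task { int pid; };

struct pgio_hdr {
	int flags;
	struct task task;   /* embedded, like nfs_pgio_header::task */
};

static void read_cleanup(struct task *t)
{
	/* was: container_of(task, struct nfs_pgio_data, task) */
	struct pgio_hdr *hdr = container_of(t, struct pgio_hdr, task);

	printf("cleanup for hdr with flags=%d\n", hdr->flags);
}

int main(void)
{
	struct pgio_hdr hdr = { .flags = 7, .task = { .pid = 42 } };

	read_cleanup(&hdr.task);
	return 0;
}
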
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
266 * Perform sync or async reads. 266 * Perform sync or async reads.
267 */ 267 */
268enum pnfs_try_status 268enum pnfs_try_status
269objlayout_read_pagelist(struct nfs_pgio_data *rdata) 269objlayout_read_pagelist(struct nfs_pgio_header *hdr)
270{ 270{
271 struct nfs_pgio_header *hdr = rdata->header;
272 struct inode *inode = hdr->inode; 271 struct inode *inode = hdr->inode;
273 loff_t offset = rdata->args.offset; 272 loff_t offset = hdr->args.offset;
274 size_t count = rdata->args.count; 273 size_t count = hdr->args.count;
275 int err; 274 int err;
276 loff_t eof; 275 loff_t eof;
277 276
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
279 if (unlikely(offset + count > eof)) { 278 if (unlikely(offset + count > eof)) {
280 if (offset >= eof) { 279 if (offset >= eof) {
281 err = 0; 280 err = 0;
282 rdata->res.count = 0; 281 hdr->res.count = 0;
283 rdata->res.eof = 1; 282 hdr->res.eof = 1;
284 /*FIXME: do we need to call pnfs_ld_read_done() */ 283 /*FIXME: do we need to call pnfs_ld_read_done() */
285 goto out; 284 goto out;
286 } 285 }
287 count = eof - offset; 286 count = eof - offset;
288 } 287 }
289 288
290 rdata->res.eof = (offset + count) >= eof; 289 hdr->res.eof = (offset + count) >= eof;
291 _fix_verify_io_params(hdr->lseg, &rdata->args.pages, 290 _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
292 &rdata->args.pgbase, 291 &hdr->args.pgbase,
293 rdata->args.offset, rdata->args.count); 292 hdr->args.offset, hdr->args.count);
294 293
295 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", 294 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
296 __func__, inode->i_ino, offset, count, rdata->res.eof); 295 __func__, inode->i_ino, offset, count, hdr->res.eof);
297 296
298 err = objio_read_pagelist(rdata); 297 err = objio_read_pagelist(hdr);
299 out: 298 out:
300 if (unlikely(err)) { 299 if (unlikely(err)) {
301 hdr->pnfs_error = err; 300 hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
312static void _rpc_write_complete(struct work_struct *work) 311static void _rpc_write_complete(struct work_struct *work)
313{ 312{
314 struct rpc_task *task; 313 struct rpc_task *task;
315 struct nfs_pgio_data *wdata; 314 struct nfs_pgio_header *hdr;
316 315
317 dprintk("%s enter\n", __func__); 316 dprintk("%s enter\n", __func__);
318 task = container_of(work, struct rpc_task, u.tk_work); 317 task = container_of(work, struct rpc_task, u.tk_work);
319 wdata = container_of(task, struct nfs_pgio_data, task); 318 hdr = container_of(task, struct nfs_pgio_header, task);
320 319
321 pnfs_ld_write_done(wdata); 320 pnfs_ld_write_done(hdr);
322} 321}
323 322
324void 323void
325objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) 324objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
326{ 325{
327 struct nfs_pgio_data *wdata = oir->rpcdata; 326 struct nfs_pgio_header *hdr = oir->rpcdata;
328 327
329 oir->status = wdata->task.tk_status = status; 328 oir->status = hdr->task.tk_status = status;
330 if (status >= 0) { 329 if (status >= 0) {
331 wdata->res.count = status; 330 hdr->res.count = status;
332 wdata->verf.committed = oir->committed; 331 hdr->verf.committed = oir->committed;
333 } else { 332 } else {
334 wdata->header->pnfs_error = status; 333 hdr->pnfs_error = status;
335 } 334 }
336 objlayout_iodone(oir); 335 objlayout_iodone(oir);
337 /* must not use oir after this point */ 336 /* must not use oir after this point */
338 337
339 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, 338 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
340 status, wdata->verf.committed, sync); 339 status, hdr->verf.committed, sync);
341 340
342 if (sync) 341 if (sync)
343 pnfs_ld_write_done(wdata); 342 pnfs_ld_write_done(hdr);
344 else { 343 else {
345 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); 344 INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
346 schedule_work(&wdata->task.u.tk_work); 345 schedule_work(&hdr->task.u.tk_work);
347 } 346 }
348} 347}
349 348
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
351 * Perform sync or async writes. 350 * Perform sync or async writes.
352 */ 351 */
353enum pnfs_try_status 352enum pnfs_try_status
354objlayout_write_pagelist(struct nfs_pgio_data *wdata, 353objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
355 int how)
356{ 354{
357 struct nfs_pgio_header *hdr = wdata->header;
358 int err; 355 int err;
359 356
360 _fix_verify_io_params(hdr->lseg, &wdata->args.pages, 357 _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
361 &wdata->args.pgbase, 358 &hdr->args.pgbase,
362 wdata->args.offset, wdata->args.count); 359 hdr->args.offset, hdr->args.count);
363 360
364 err = objio_write_pagelist(wdata, how); 361 err = objio_write_pagelist(hdr, how);
365 if (unlikely(err)) { 362 if (unlikely(err)) {
366 hdr->pnfs_error = err; 363 hdr->pnfs_error = err;
367 dprintk("%s: Returned Error %d\n", __func__, err); 364 dprintk("%s: Returned Error %d\n", __func__, err);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 01e041029a6c..fd13f1d2f136 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
119 */ 119 */
120extern void objio_free_result(struct objlayout_io_res *oir); 120extern void objio_free_result(struct objlayout_io_res *oir);
121 121
122extern int objio_read_pagelist(struct nfs_pgio_data *rdata); 122extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
123extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); 123extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
124 124
125/* 125/*
126 * callback API 126 * callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
168extern void objlayout_free_lseg(struct pnfs_layout_segment *); 168extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169 169
170extern enum pnfs_try_status objlayout_read_pagelist( 170extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_pgio_data *); 171 struct nfs_pgio_header *);
172 172
173extern enum pnfs_try_status objlayout_write_pagelist( 173extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_pgio_data *, 174 struct nfs_pgio_header *,
175 int how); 175 int how);
176 176
177extern void objlayout_encode_layoutcommit( 177extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 0be5050638f7..ba491926df5f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -141,16 +141,24 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
141 * @req - request in group that is to be locked 141 * @req - request in group that is to be locked
142 * 142 *
143 * this lock must be held if modifying the page group list 143 * this lock must be held if modifying the page group list
144 *
145 * returns result from wait_on_bit_lock: 0 on success, < 0 on error
144 */ 146 */
145void 147int
146nfs_page_group_lock(struct nfs_page *req) 148nfs_page_group_lock(struct nfs_page *req, bool wait)
147{ 149{
148 struct nfs_page *head = req->wb_head; 150 struct nfs_page *head = req->wb_head;
151 int ret;
149 152
150 WARN_ON_ONCE(head != head->wb_head); 153 WARN_ON_ONCE(head != head->wb_head);
151 154
152 wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 155 do {
156 ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
153 TASK_UNINTERRUPTIBLE); 157 TASK_UNINTERRUPTIBLE);
158 } while (wait && ret != 0);
159
160 WARN_ON_ONCE(ret > 0);
161 return ret;
154} 162}
155 163
156/* 164/*
@@ -211,7 +219,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
211{ 219{
212 bool ret; 220 bool ret;
213 221
214 nfs_page_group_lock(req); 222 nfs_page_group_lock(req, true);
215 ret = nfs_page_group_sync_on_bit_locked(req, bit); 223 ret = nfs_page_group_sync_on_bit_locked(req, bit);
216 nfs_page_group_unlock(req); 224 nfs_page_group_unlock(req);
217 225
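
nfs_page_group_lock() now takes a wait flag and returns the wait_on_bit_lock() result, so callers may either spin until the head lock is taken or see the failure immediately. A small sketch of that contract, where try_lock_bit() is a made-up stand-in for wait_on_bit_lock():

#include <stdbool.h>
#include <stdio.h>

static int attempts;

static int try_lock_bit(void)
{
	/* pretend the lock is contended for the first two attempts */
	return ++attempts < 3 ? -11 /* -EAGAIN-ish */ : 0;
}

static int page_group_lock(bool wait)
{
	int ret;

	do {
		ret = try_lock_bit();
	} while (wait && ret != 0);   /* mirrors the patched loop */

	return ret;
}

int main(void)
{
	attempts = 0;
	printf("wait=false -> %d\n", page_group_lock(false)); /* may fail */
	attempts = 0;
	printf("wait=true  -> %d\n", page_group_lock(true));  /* always 0 */
	return 0;
}
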
@@ -454,123 +462,72 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
454} 462}
455EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 463EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
456 464
457static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) 465struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
458{
459 return container_of(hdr, struct nfs_rw_header, header);
460}
461
462/**
463 * nfs_rw_header_alloc - Allocate a header for a read or write
464 * @ops: Read or write function vector
465 */
466struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
467{ 466{
468 struct nfs_rw_header *header = ops->rw_alloc_header(); 467 struct nfs_pgio_header *hdr = ops->rw_alloc_header();
469
470 if (header) {
471 struct nfs_pgio_header *hdr = &header->header;
472 468
469 if (hdr) {
473 INIT_LIST_HEAD(&hdr->pages); 470 INIT_LIST_HEAD(&hdr->pages);
474 spin_lock_init(&hdr->lock); 471 spin_lock_init(&hdr->lock);
475 atomic_set(&hdr->refcnt, 0);
476 hdr->rw_ops = ops; 472 hdr->rw_ops = ops;
477 } 473 }
478 return header; 474 return hdr;
479} 475}
480EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); 476EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
481 477
482/* 478/*
483 * nfs_rw_header_free - Free a read or write header 479 * nfs_pgio_header_free - Free a read or write header
484 * @hdr: The header to free 480 * @hdr: The header to free
485 */ 481 */
486void nfs_rw_header_free(struct nfs_pgio_header *hdr) 482void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
487{ 483{
488 hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); 484 hdr->rw_ops->rw_free_header(hdr);
489} 485}
490EXPORT_SYMBOL_GPL(nfs_rw_header_free); 486EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
491 487
492/** 488/**
493 * nfs_pgio_data_alloc - Allocate pageio data 489 * nfs_pgio_data_destroy - make @hdr suitable for reuse
494 * @hdr: The header making a request 490 *
495 * @pagecount: Number of pages to create 491 * Frees memory and releases refs from nfs_generic_pgio, so that it may
496 */ 492 * be called again.
497static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, 493 *
498 unsigned int pagecount) 494 * @hdr: A header that has had nfs_generic_pgio called
499{
500 struct nfs_pgio_data *data, *prealloc;
501
502 prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
503 if (prealloc->header == NULL)
504 data = prealloc;
505 else
506 data = kzalloc(sizeof(*data), GFP_KERNEL);
507 if (!data)
508 goto out;
509
510 if (nfs_pgarray_set(&data->pages, pagecount)) {
511 data->header = hdr;
512 atomic_inc(&hdr->refcnt);
513 } else {
514 if (data != prealloc)
515 kfree(data);
516 data = NULL;
517 }
518out:
519 return data;
520}
521
522/**
523 * nfs_pgio_data_release - Properly free pageio data
524 * @data: The data to release
525 */ 495 */
526void nfs_pgio_data_release(struct nfs_pgio_data *data) 496void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
527{ 497{
528 struct nfs_pgio_header *hdr = data->header; 498 put_nfs_open_context(hdr->args.context);
529 struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); 499 if (hdr->page_array.pagevec != hdr->page_array.page_array)
530 500 kfree(hdr->page_array.pagevec);
531 put_nfs_open_context(data->args.context);
532 if (data->pages.pagevec != data->pages.page_array)
533 kfree(data->pages.pagevec);
534 if (data == &pageio_header->rpc_data) {
535 data->header = NULL;
536 data = NULL;
537 }
538 if (atomic_dec_and_test(&hdr->refcnt))
539 hdr->completion_ops->completion(hdr);
540 /* Note: we only free the rpc_task after callbacks are done.
541 * See the comment in rpc_free_task() for why
542 */
543 kfree(data);
544} 501}
545EXPORT_SYMBOL_GPL(nfs_pgio_data_release); 502EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
546 503
547/** 504/**
548 * nfs_pgio_rpcsetup - Set up arguments for a pageio call 505 * nfs_pgio_rpcsetup - Set up arguments for a pageio call
549 * @data: The pageio data 506 * @hdr: The pageio hdr
550 * @count: Number of bytes to read 507 * @count: Number of bytes to read
551 * @offset: Initial offset 508 * @offset: Initial offset
552 * @how: How to commit data (writes only) 509 * @how: How to commit data (writes only)
553 * @cinfo: Commit information for the call (writes only) 510 * @cinfo: Commit information for the call (writes only)
554 */ 511 */
555static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, 512static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
556 unsigned int count, unsigned int offset, 513 unsigned int count, unsigned int offset,
557 int how, struct nfs_commit_info *cinfo) 514 int how, struct nfs_commit_info *cinfo)
558{ 515{
559 struct nfs_page *req = data->header->req; 516 struct nfs_page *req = hdr->req;
560 517
561 /* Set up the RPC argument and reply structs 518 /* Set up the RPC argument and reply structs
562 * NB: take care not to mess about with data->commit et al. */ 519 * NB: take care not to mess about with hdr->commit et al. */
563 520
564 data->args.fh = NFS_FH(data->header->inode); 521 hdr->args.fh = NFS_FH(hdr->inode);
565 data->args.offset = req_offset(req) + offset; 522 hdr->args.offset = req_offset(req) + offset;
566 /* pnfs_set_layoutcommit needs this */ 523 /* pnfs_set_layoutcommit needs this */
567 data->mds_offset = data->args.offset; 524 hdr->mds_offset = hdr->args.offset;
568 data->args.pgbase = req->wb_pgbase + offset; 525 hdr->args.pgbase = req->wb_pgbase + offset;
569 data->args.pages = data->pages.pagevec; 526 hdr->args.pages = hdr->page_array.pagevec;
570 data->args.count = count; 527 hdr->args.count = count;
571 data->args.context = get_nfs_open_context(req->wb_context); 528 hdr->args.context = get_nfs_open_context(req->wb_context);
572 data->args.lock_context = req->wb_lock_context; 529 hdr->args.lock_context = req->wb_lock_context;
573 data->args.stable = NFS_UNSTABLE; 530 hdr->args.stable = NFS_UNSTABLE;
574 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 531 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
575 case 0: 532 case 0:
576 break; 533 break;
@@ -578,59 +535,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
578 if (nfs_reqs_to_commit(cinfo)) 535 if (nfs_reqs_to_commit(cinfo))
579 break; 536 break;
580 default: 537 default:
581 data->args.stable = NFS_FILE_SYNC; 538 hdr->args.stable = NFS_FILE_SYNC;
582 } 539 }
583 540
584 data->res.fattr = &data->fattr; 541 hdr->res.fattr = &hdr->fattr;
585 data->res.count = count; 542 hdr->res.count = count;
586 data->res.eof = 0; 543 hdr->res.eof = 0;
587 data->res.verf = &data->verf; 544 hdr->res.verf = &hdr->verf;
588 nfs_fattr_init(&data->fattr); 545 nfs_fattr_init(&hdr->fattr);
589} 546}
590 547
591/** 548/**
592 * nfs_pgio_prepare - Prepare pageio data to go over the wire 549 * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
593 * @task: The current task 550 * @task: The current task
594 * @calldata: pageio data to prepare 551 * @calldata: pageio header to prepare
595 */ 552 */
596static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) 553static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597{ 554{
598 struct nfs_pgio_data *data = calldata; 555 struct nfs_pgio_header *hdr = calldata;
599 int err; 556 int err;
600 err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); 557 err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
601 if (err) 558 if (err)
602 rpc_exit(task, err); 559 rpc_exit(task, err);
603} 560}
604 561
605int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, 562int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
606 const struct rpc_call_ops *call_ops, int how, int flags) 563 const struct rpc_call_ops *call_ops, int how, int flags)
607{ 564{
608 struct rpc_task *task; 565 struct rpc_task *task;
609 struct rpc_message msg = { 566 struct rpc_message msg = {
610 .rpc_argp = &data->args, 567 .rpc_argp = &hdr->args,
611 .rpc_resp = &data->res, 568 .rpc_resp = &hdr->res,
612 .rpc_cred = data->header->cred, 569 .rpc_cred = hdr->cred,
613 }; 570 };
614 struct rpc_task_setup task_setup_data = { 571 struct rpc_task_setup task_setup_data = {
615 .rpc_client = clnt, 572 .rpc_client = clnt,
616 .task = &data->task, 573 .task = &hdr->task,
617 .rpc_message = &msg, 574 .rpc_message = &msg,
618 .callback_ops = call_ops, 575 .callback_ops = call_ops,
619 .callback_data = data, 576 .callback_data = hdr,
620 .workqueue = nfsiod_workqueue, 577 .workqueue = nfsiod_workqueue,
621 .flags = RPC_TASK_ASYNC | flags, 578 .flags = RPC_TASK_ASYNC | flags,
622 }; 579 };
623 int ret = 0; 580 int ret = 0;
624 581
625 data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); 582 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
626 583
627 dprintk("NFS: %5u initiated pgio call " 584 dprintk("NFS: %5u initiated pgio call "
628 "(req %s/%llu, %u bytes @ offset %llu)\n", 585 "(req %s/%llu, %u bytes @ offset %llu)\n",
629 data->task.tk_pid, 586 hdr->task.tk_pid,
630 data->header->inode->i_sb->s_id, 587 hdr->inode->i_sb->s_id,
631 (unsigned long long)NFS_FILEID(data->header->inode), 588 (unsigned long long)NFS_FILEID(hdr->inode),
632 data->args.count, 589 hdr->args.count,
633 (unsigned long long)data->args.offset); 590 (unsigned long long)hdr->args.offset);
634 591
635 task = rpc_run_task(&task_setup_data); 592 task = rpc_run_task(&task_setup_data);
636 if (IS_ERR(task)) { 593 if (IS_ERR(task)) {
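
The hunk above folds the old nfs_rw_header wrapper and its preallocated nfs_pgio_data into a single nfs_pgio_header that owns the page array and RPC task directly. A minimal user-space sketch of the merged-object lifecycle, with all names illustrative:

#include <stdio.h>
#include <stdlib.h>

struct page_array {
	void **pagevec;
	unsigned int npages;
};

struct pgio_hdr {
	struct page_array page_array;   /* was nfs_pgio_data::pages */
};

static struct pgio_hdr *pgio_header_alloc(void)
{
	/* one allocation now carries header and RPC data together */
	return calloc(1, sizeof(struct pgio_hdr));
}

static void pgio_data_destroy(struct pgio_hdr *hdr)
{
	/* release the page vector but keep hdr reusable, as the new
	 * nfs_pgio_data_destroy() does */
	free(hdr->page_array.pagevec);
	hdr->page_array.pagevec = NULL;
	hdr->page_array.npages = 0;
}

int main(void)
{
	struct pgio_hdr *hdr = pgio_header_alloc();

	if (hdr == NULL)
		return 1;
	hdr->page_array.pagevec = calloc(4, sizeof(void *));
	hdr->page_array.npages = 4;
	pgio_data_destroy(hdr);
	free(hdr);
	printf("merged header: alloc + destroy ok\n");
	return 0;
}
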
@@ -657,22 +614,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
657 struct nfs_pgio_header *hdr) 614 struct nfs_pgio_header *hdr)
658{ 615{
659 set_bit(NFS_IOHDR_REDO, &hdr->flags); 616 set_bit(NFS_IOHDR_REDO, &hdr->flags);
660 nfs_pgio_data_release(hdr->data); 617 nfs_pgio_data_destroy(hdr);
661 hdr->data = NULL; 618 hdr->completion_ops->completion(hdr);
662 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 619 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
663 return -ENOMEM; 620 return -ENOMEM;
664} 621}
665 622
666/** 623/**
667 * nfs_pgio_release - Release pageio data 624 * nfs_pgio_release - Release pageio data
668 * @calldata: The pageio data to release 625 * @calldata: The pageio header to release
669 */ 626 */
670static void nfs_pgio_release(void *calldata) 627static void nfs_pgio_release(void *calldata)
671{ 628{
672 struct nfs_pgio_data *data = calldata; 629 struct nfs_pgio_header *hdr = calldata;
673 if (data->header->rw_ops->rw_release) 630 if (hdr->rw_ops->rw_release)
674 data->header->rw_ops->rw_release(data); 631 hdr->rw_ops->rw_release(hdr);
675 nfs_pgio_data_release(data); 632 nfs_pgio_data_destroy(hdr);
633 hdr->completion_ops->completion(hdr);
676} 634}
677 635
678/** 636/**
@@ -713,22 +671,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
713/** 671/**
714 * nfs_pgio_result - Basic pageio error handling 672 * nfs_pgio_result - Basic pageio error handling
715 * @task: The task that ran 673 * @task: The task that ran
716 * @calldata: Pageio data to check 674 * @calldata: Pageio header to check
717 */ 675 */
718static void nfs_pgio_result(struct rpc_task *task, void *calldata) 676static void nfs_pgio_result(struct rpc_task *task, void *calldata)
719{ 677{
720 struct nfs_pgio_data *data = calldata; 678 struct nfs_pgio_header *hdr = calldata;
721 struct inode *inode = data->header->inode; 679 struct inode *inode = hdr->inode;
722 680
723 dprintk("NFS: %s: %5u, (status %d)\n", __func__, 681 dprintk("NFS: %s: %5u, (status %d)\n", __func__,
724 task->tk_pid, task->tk_status); 682 task->tk_pid, task->tk_status);
725 683
726 if (data->header->rw_ops->rw_done(task, data, inode) != 0) 684 if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
727 return; 685 return;
728 if (task->tk_status < 0) 686 if (task->tk_status < 0)
729 nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); 687 nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
730 else 688 else
731 data->header->rw_ops->rw_result(task, data); 689 hdr->rw_ops->rw_result(task, hdr);
732} 690}
733 691
734/* 692/*
@@ -744,17 +702,16 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
744{ 702{
745 struct nfs_page *req; 703 struct nfs_page *req;
746 struct page **pages; 704 struct page **pages;
747 struct nfs_pgio_data *data;
748 struct list_head *head = &desc->pg_list; 705 struct list_head *head = &desc->pg_list;
749 struct nfs_commit_info cinfo; 706 struct nfs_commit_info cinfo;
707 unsigned int pagecount;
750 708
751 data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, 709 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
752 desc->pg_count)); 710 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
753 if (!data)
754 return nfs_pgio_error(desc, hdr); 711 return nfs_pgio_error(desc, hdr);
755 712
756 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); 713 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
757 pages = data->pages.pagevec; 714 pages = hdr->page_array.pagevec;
758 while (!list_empty(head)) { 715 while (!list_empty(head)) {
759 req = nfs_list_entry(head->next); 716 req = nfs_list_entry(head->next);
760 nfs_list_remove_request(req); 717 nfs_list_remove_request(req);
@@ -767,8 +724,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
767 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 724 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
768 725
769 /* Set up the argument struct */ 726 /* Set up the argument struct */
770 nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 727 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
771 hdr->data = data;
772 desc->pg_rpc_callops = &nfs_pgio_common_ops; 728 desc->pg_rpc_callops = &nfs_pgio_common_ops;
773 return 0; 729 return 0;
774} 730}
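
nfs_generic_pgio() now sizes and fills hdr->page_array in place instead of allocating a separate data object. The core of the loop is draining the descriptor's request list into a flat page vector; a user-space analogue (struct req and drain_requests() are invented):

#include <stdio.h>

struct req {
	void *page;
	struct req *next;
};

/* pop each request off the list and pack its page into the flat
 * vector, as nfs_generic_pgio() now fills hdr->page_array directly */
static unsigned int drain_requests(struct req **head, void **pages)
{
	unsigned int n = 0;

	while (*head != NULL) {
		struct req *r = *head;

		*head = r->next;   /* nfs_list_remove_request() analogue */
		pages[n++] = r->page;
	}
	return n;
}

int main(void)
{
	int p0, p1;
	struct req r1 = { .page = &p1, .next = NULL };
	struct req r0 = { .page = &p0, .next = &r1 };
	struct req *head = &r0;
	void *pages[2];

	printf("packed %u pages\n", drain_requests(&head, pages));
	return 0;
}
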
@@ -776,25 +732,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
776 732
777static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 733static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
778{ 734{
779 struct nfs_rw_header *rw_hdr;
780 struct nfs_pgio_header *hdr; 735 struct nfs_pgio_header *hdr;
781 int ret; 736 int ret;
782 737
783 rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); 738 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
784 if (!rw_hdr) { 739 if (!hdr) {
785 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 740 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
786 return -ENOMEM; 741 return -ENOMEM;
787 } 742 }
788 hdr = &rw_hdr->header; 743 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
789 nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
790 atomic_inc(&hdr->refcnt);
791 ret = nfs_generic_pgio(desc, hdr); 744 ret = nfs_generic_pgio(desc, hdr);
792 if (ret == 0) 745 if (ret == 0)
793 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 746 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
794 hdr->data, desc->pg_rpc_callops, 747 hdr, desc->pg_rpc_callops,
795 desc->pg_ioflags, 0); 748 desc->pg_ioflags, 0);
796 if (atomic_dec_and_test(&hdr->refcnt))
797 hdr->completion_ops->completion(hdr);
798 return ret; 749 return ret;
799} 750}
800 751
@@ -907,8 +858,13 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
907 struct nfs_page *subreq; 858 struct nfs_page *subreq;
908 unsigned int bytes_left = 0; 859 unsigned int bytes_left = 0;
909 unsigned int offset, pgbase; 860 unsigned int offset, pgbase;
861 int ret;
910 862
911 nfs_page_group_lock(req); 863 ret = nfs_page_group_lock(req, false);
864 if (ret < 0) {
865 desc->pg_error = ret;
866 return 0;
867 }
912 868
913 subreq = req; 869 subreq = req;
914 bytes_left = subreq->wb_bytes; 870 bytes_left = subreq->wb_bytes;
@@ -930,7 +886,11 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
930 if (desc->pg_recoalesce) 886 if (desc->pg_recoalesce)
931 return 0; 887 return 0;
932 /* retry add_request for this subreq */ 888 /* retry add_request for this subreq */
933 nfs_page_group_lock(req); 889 ret = nfs_page_group_lock(req, false);
890 if (ret < 0) {
891 desc->pg_error = ret;
892 return 0;
893 }
934 continue; 894 continue;
935 } 895 }
936 896
@@ -1005,7 +965,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1005 } while (ret); 965 } while (ret);
1006 return ret; 966 return ret;
1007} 967}
1008EXPORT_SYMBOL_GPL(nfs_pageio_add_request); 968
969/*
970 * nfs_pageio_resend - Transfer requests to new descriptor and resend
971 * @hdr - the pgio header to move requests from
972 * @desc - the pageio descriptor to add requests to
973 *
974 * Try to move each request (nfs_page) from @hdr to @desc then attempt
975 * to send them.
976 *
977 * Returns 0 on success and < 0 on error.
978 */
979int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
980 struct nfs_pgio_header *hdr)
981{
982 LIST_HEAD(failed);
983
984 desc->pg_dreq = hdr->dreq;
985 while (!list_empty(&hdr->pages)) {
986 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
987
988 nfs_list_remove_request(req);
989 if (!nfs_pageio_add_request(desc, req))
990 nfs_list_add_request(req, &failed);
991 }
992 nfs_pageio_complete(desc);
993 if (!list_empty(&failed)) {
994 list_move(&failed, &hdr->pages);
995 return -EIO;
996 }
997 return 0;
998}
999EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1009 1000
1010/** 1001/**
1011 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 1002 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1012,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1021 break; 1012 break;
1022 } 1013 }
1023} 1014}
1024EXPORT_SYMBOL_GPL(nfs_pageio_complete);
1025 1015
1026/** 1016/**
1027 * nfs_pageio_cond_complete - Conditional I/O completion 1017 * nfs_pageio_cond_complete - Conditional I/O completion
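
nfs_pageio_resend() generalizes the old resend-to-MDS loops: requests are moved from the header to a fresh descriptor, failures are collected, and -EIO is returned if any request could not be re-queued. A toy sketch of that contract, with pageio_add_request() standing in for the real coalescing call:

#include <stdio.h>

/* pretend the descriptor accepts all but one request */
static int pageio_add_request(int id)
{
	return id != 1;
}

/* analogue of nfs_pageio_resend(): re-add every request, count the
 * failures, and report -EIO if any remain */
static int pageio_resend(int nreq)
{
	int failed = 0, id;

	for (id = 0; id < nreq; id++)
		if (!pageio_add_request(id))
			failed++;   /* kernel keeps these on hdr->pages */

	return failed != 0 ? -5 /* -EIO */ : 0;
}

int main(void)
{
	printf("resend -> %d\n", pageio_resend(3));
	return 0;
}
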
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a8914b335617..a3851debf8a2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,6 +361,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
361} 361}
362EXPORT_SYMBOL_GPL(pnfs_put_lseg); 362EXPORT_SYMBOL_GPL(pnfs_put_lseg);
363 363
364static void pnfs_put_lseg_async_work(struct work_struct *work)
365{
366 struct pnfs_layout_segment *lseg;
367
368 lseg = container_of(work, struct pnfs_layout_segment, pls_work);
369
370 pnfs_put_lseg(lseg);
371}
372
373void
374pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
375{
376 INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work);
377 schedule_work(&lseg->pls_work);
378}
379EXPORT_SYMBOL_GPL(pnfs_put_lseg_async);
380
364static u64 381static u64
365end_offset(u64 start, u64 len) 382end_offset(u64 start, u64 len)
366{ 383{
@@ -1470,41 +1487,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1470} 1487}
1471EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 1488EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1472 1489
1473int pnfs_write_done_resend_to_mds(struct inode *inode, 1490int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
1474 struct list_head *head,
1475 const struct nfs_pgio_completion_ops *compl_ops,
1476 struct nfs_direct_req *dreq)
1477{ 1491{
1478 struct nfs_pageio_descriptor pgio; 1492 struct nfs_pageio_descriptor pgio;
1479 LIST_HEAD(failed);
1480 1493
1481 /* Resend all requests through the MDS */ 1494 /* Resend all requests through the MDS */
1482 nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); 1495 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1483 pgio.pg_dreq = dreq; 1496 hdr->completion_ops);
1484 while (!list_empty(head)) { 1497 return nfs_pageio_resend(&pgio, hdr);
1485 struct nfs_page *req = nfs_list_entry(head->next);
1486
1487 nfs_list_remove_request(req);
1488 if (!nfs_pageio_add_request(&pgio, req))
1489 nfs_list_add_request(req, &failed);
1490 }
1491 nfs_pageio_complete(&pgio);
1492
1493 if (!list_empty(&failed)) {
1494 /* For some reason our attempt to resend pages failed. Mark the
1495 * overall send request as having failed, and let
1496 * nfs_writeback_release_full deal with the error.
1497 */
1498 list_move(&failed, head);
1499 return -EIO;
1500 }
1501 return 0;
1502} 1498}
1503EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); 1499EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
1504 1500
1505static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) 1501static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
1506{ 1502{
1507 struct nfs_pgio_header *hdr = data->header;
1508 1503
1509 dprintk("pnfs write error = %d\n", hdr->pnfs_error); 1504 dprintk("pnfs write error = %d\n", hdr->pnfs_error);
1510 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1505 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1507,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
1512 pnfs_return_layout(hdr->inode); 1507 pnfs_return_layout(hdr->inode);
1513 } 1508 }
1514 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1509 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1515 data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, 1510 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
1516 &hdr->pages,
1517 hdr->completion_ops,
1518 hdr->dreq);
1519} 1511}
1520 1512
1521/* 1513/*
1522 * Called by non rpc-based layout drivers 1514 * Called by non rpc-based layout drivers
1523 */ 1515 */
1524void pnfs_ld_write_done(struct nfs_pgio_data *data) 1516void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
1525{ 1517{
1526 struct nfs_pgio_header *hdr = data->header; 1518 trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
1527
1528 trace_nfs4_pnfs_write(data, hdr->pnfs_error);
1529 if (!hdr->pnfs_error) { 1519 if (!hdr->pnfs_error) {
1530 pnfs_set_layoutcommit(data); 1520 pnfs_set_layoutcommit(hdr);
1531 hdr->mds_ops->rpc_call_done(&data->task, data); 1521 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1532 } else 1522 } else
1533 pnfs_ld_handle_write_error(data); 1523 pnfs_ld_handle_write_error(hdr);
1534 hdr->mds_ops->rpc_release(data); 1524 hdr->mds_ops->rpc_release(hdr);
1535} 1525}
1536EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1526EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1537 1527
1538static void 1528static void
1539pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1529pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1540 struct nfs_pgio_data *data) 1530 struct nfs_pgio_header *hdr)
1541{ 1531{
1542 struct nfs_pgio_header *hdr = data->header;
1543
1544 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1532 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1545 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1533 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1546 nfs_pageio_reset_write_mds(desc); 1534 nfs_pageio_reset_write_mds(desc);
1547 desc->pg_recoalesce = 1; 1535 desc->pg_recoalesce = 1;
1548 } 1536 }
1549 nfs_pgio_data_release(data); 1537 nfs_pgio_data_destroy(hdr);
1550} 1538}
1551 1539
1552static enum pnfs_try_status 1540static enum pnfs_try_status
1553pnfs_try_to_write_data(struct nfs_pgio_data *wdata, 1541pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
1554 const struct rpc_call_ops *call_ops, 1542 const struct rpc_call_ops *call_ops,
1555 struct pnfs_layout_segment *lseg, 1543 struct pnfs_layout_segment *lseg,
1556 int how) 1544 int how)
1557{ 1545{
1558 struct nfs_pgio_header *hdr = wdata->header;
1559 struct inode *inode = hdr->inode; 1546 struct inode *inode = hdr->inode;
1560 enum pnfs_try_status trypnfs; 1547 enum pnfs_try_status trypnfs;
1561 struct nfs_server *nfss = NFS_SERVER(inode); 1548 struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1550,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
1563 hdr->mds_ops = call_ops; 1550 hdr->mds_ops = call_ops;
1564 1551
1565 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1552 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1566 inode->i_ino, wdata->args.count, wdata->args.offset, how); 1553 inode->i_ino, hdr->args.count, hdr->args.offset, how);
1567 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); 1554 trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
1568 if (trypnfs != PNFS_NOT_ATTEMPTED) 1555 if (trypnfs != PNFS_NOT_ATTEMPTED)
1569 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); 1556 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
1570 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1557 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1562,105 @@ static void
1575pnfs_do_write(struct nfs_pageio_descriptor *desc, 1562pnfs_do_write(struct nfs_pageio_descriptor *desc,
1576 struct nfs_pgio_header *hdr, int how) 1563 struct nfs_pgio_header *hdr, int how)
1577{ 1564{
1578 struct nfs_pgio_data *data = hdr->data;
1579 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1565 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1580 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1566 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1581 enum pnfs_try_status trypnfs; 1567 enum pnfs_try_status trypnfs;
1582 1568
1583 desc->pg_lseg = NULL; 1569 desc->pg_lseg = NULL;
1584 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); 1570 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1585 if (trypnfs == PNFS_NOT_ATTEMPTED) 1571 if (trypnfs == PNFS_NOT_ATTEMPTED)
1586 pnfs_write_through_mds(desc, data); 1572 pnfs_write_through_mds(desc, hdr);
1587 pnfs_put_lseg(lseg); 1573 pnfs_put_lseg(lseg);
1588} 1574}
1589 1575
1590static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1576static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1591{ 1577{
1592 pnfs_put_lseg(hdr->lseg); 1578 pnfs_put_lseg(hdr->lseg);
1593 nfs_rw_header_free(hdr); 1579 nfs_pgio_header_free(hdr);
1594} 1580}
1595EXPORT_SYMBOL_GPL(pnfs_writehdr_free); 1581EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1596 1582
1597int 1583int
1598pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1584pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1599{ 1585{
1600 struct nfs_rw_header *whdr;
1601 struct nfs_pgio_header *hdr; 1586 struct nfs_pgio_header *hdr;
1602 int ret; 1587 int ret;
1603 1588
1604 whdr = nfs_rw_header_alloc(desc->pg_rw_ops); 1589 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1605 if (!whdr) { 1590 if (!hdr) {
1606 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1591 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1607 pnfs_put_lseg(desc->pg_lseg); 1592 pnfs_put_lseg(desc->pg_lseg);
1608 desc->pg_lseg = NULL; 1593 desc->pg_lseg = NULL;
1609 return -ENOMEM; 1594 return -ENOMEM;
1610 } 1595 }
1611 hdr = &whdr->header;
1612 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1596 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1613 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1597 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1614 atomic_inc(&hdr->refcnt);
1615 ret = nfs_generic_pgio(desc, hdr); 1598 ret = nfs_generic_pgio(desc, hdr);
1616 if (ret != 0) { 1599 if (ret != 0) {
1617 pnfs_put_lseg(desc->pg_lseg); 1600 pnfs_put_lseg(desc->pg_lseg);
1618 desc->pg_lseg = NULL; 1601 desc->pg_lseg = NULL;
1619 } else 1602 } else
1620 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1603 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1621 if (atomic_dec_and_test(&hdr->refcnt))
1622 hdr->completion_ops->completion(hdr);
1623 return ret; 1604 return ret;
1624} 1605}
1625EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1606EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1626 1607
1627int pnfs_read_done_resend_to_mds(struct inode *inode, 1608int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
1628 struct list_head *head,
1629 const struct nfs_pgio_completion_ops *compl_ops,
1630 struct nfs_direct_req *dreq)
1631{ 1609{
1632 struct nfs_pageio_descriptor pgio; 1610 struct nfs_pageio_descriptor pgio;
1633 LIST_HEAD(failed);
1634 1611
1635 /* Resend all requests through the MDS */ 1612 /* Resend all requests through the MDS */
1636 nfs_pageio_init_read(&pgio, inode, true, compl_ops); 1613 nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
1637 pgio.pg_dreq = dreq; 1614 return nfs_pageio_resend(&pgio, hdr);
1638 while (!list_empty(head)) {
1639 struct nfs_page *req = nfs_list_entry(head->next);
1640
1641 nfs_list_remove_request(req);
1642 if (!nfs_pageio_add_request(&pgio, req))
1643 nfs_list_add_request(req, &failed);
1644 }
1645 nfs_pageio_complete(&pgio);
1646
1647 if (!list_empty(&failed)) {
1648 list_move(&failed, head);
1649 return -EIO;
1650 }
1651 return 0;
1652} 1615}
1653EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); 1616EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
1654 1617
1655static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) 1618static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
1656{ 1619{
1657 struct nfs_pgio_header *hdr = data->header;
1658
1659 dprintk("pnfs read error = %d\n", hdr->pnfs_error); 1620 dprintk("pnfs read error = %d\n", hdr->pnfs_error);
1660 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1621 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1661 PNFS_LAYOUTRET_ON_ERROR) { 1622 PNFS_LAYOUTRET_ON_ERROR) {
1662 pnfs_return_layout(hdr->inode); 1623 pnfs_return_layout(hdr->inode);
1663 } 1624 }
1664 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1625 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1665 data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, 1626 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
1666 &hdr->pages,
1667 hdr->completion_ops,
1668 hdr->dreq);
1669} 1627}
1670 1628
1671/* 1629/*
1672 * Called by non rpc-based layout drivers 1630 * Called by non rpc-based layout drivers
1673 */ 1631 */
1674void pnfs_ld_read_done(struct nfs_pgio_data *data) 1632void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
1675{ 1633{
1676 struct nfs_pgio_header *hdr = data->header; 1634 trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
1677
1678 trace_nfs4_pnfs_read(data, hdr->pnfs_error);
1679 if (likely(!hdr->pnfs_error)) { 1635 if (likely(!hdr->pnfs_error)) {
1680 __nfs4_read_done_cb(data); 1636 __nfs4_read_done_cb(hdr);
1681 hdr->mds_ops->rpc_call_done(&data->task, data); 1637 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1682 } else 1638 } else
1683 pnfs_ld_handle_read_error(data); 1639 pnfs_ld_handle_read_error(hdr);
1684 hdr->mds_ops->rpc_release(data); 1640 hdr->mds_ops->rpc_release(hdr);
1685} 1641}
1686EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1642EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1687 1643
1688static void 1644static void
1689pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1645pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1690 struct nfs_pgio_data *data) 1646 struct nfs_pgio_header *hdr)
1691{ 1647{
1692 struct nfs_pgio_header *hdr = data->header;
1693
1694 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1648 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1695 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1649 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1696 nfs_pageio_reset_read_mds(desc); 1650 nfs_pageio_reset_read_mds(desc);
1697 desc->pg_recoalesce = 1; 1651 desc->pg_recoalesce = 1;
1698 } 1652 }
1699 nfs_pgio_data_release(data); 1653 nfs_pgio_data_destroy(hdr);
1700} 1654}
1701 1655
1702/* 1656/*
1703 * Call the appropriate parallel I/O subsystem read function. 1657 * Call the appropriate parallel I/O subsystem read function.
1704 */ 1658 */
1705static enum pnfs_try_status 1659static enum pnfs_try_status
1706pnfs_try_to_read_data(struct nfs_pgio_data *rdata, 1660pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1707 const struct rpc_call_ops *call_ops, 1661 const struct rpc_call_ops *call_ops,
1708 struct pnfs_layout_segment *lseg) 1662 struct pnfs_layout_segment *lseg)
1709{ 1663{
1710 struct nfs_pgio_header *hdr = rdata->header;
1711 struct inode *inode = hdr->inode; 1664 struct inode *inode = hdr->inode;
1712 struct nfs_server *nfss = NFS_SERVER(inode); 1665 struct nfs_server *nfss = NFS_SERVER(inode);
1713 enum pnfs_try_status trypnfs; 1666 enum pnfs_try_status trypnfs;
@@ -1715,9 +1668,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
1715 hdr->mds_ops = call_ops; 1668 hdr->mds_ops = call_ops;
1716 1669
1717 dprintk("%s: Reading ino:%lu %u@%llu\n", 1670 dprintk("%s: Reading ino:%lu %u@%llu\n",
1718 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 1671 __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
1719 1672
1720 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); 1673 trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
1721 if (trypnfs != PNFS_NOT_ATTEMPTED) 1674 if (trypnfs != PNFS_NOT_ATTEMPTED)
1722 nfs_inc_stats(inode, NFSIOS_PNFS_READ); 1675 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
1723 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1676 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1680,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
1727static void 1680static void
1728pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 1681pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1729{ 1682{
1730 struct nfs_pgio_data *data = hdr->data;
1731 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1683 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1732 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1684 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1733 enum pnfs_try_status trypnfs; 1685 enum pnfs_try_status trypnfs;
1734 1686
1735 desc->pg_lseg = NULL; 1687 desc->pg_lseg = NULL;
1736 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); 1688 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1737 if (trypnfs == PNFS_NOT_ATTEMPTED) 1689 if (trypnfs == PNFS_NOT_ATTEMPTED)
1738 pnfs_read_through_mds(desc, data); 1690 pnfs_read_through_mds(desc, hdr);
1739 pnfs_put_lseg(lseg); 1691 pnfs_put_lseg(lseg);
1740} 1692}
1741 1693
1742static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 1694static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
1743{ 1695{
1744 pnfs_put_lseg(hdr->lseg); 1696 pnfs_put_lseg(hdr->lseg);
1745 nfs_rw_header_free(hdr); 1697 nfs_pgio_header_free(hdr);
1746} 1698}
1747EXPORT_SYMBOL_GPL(pnfs_readhdr_free); 1699EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1748 1700
1749int 1701int
1750pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 1702pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1751{ 1703{
1752 struct nfs_rw_header *rhdr;
1753 struct nfs_pgio_header *hdr; 1704 struct nfs_pgio_header *hdr;
1754 int ret; 1705 int ret;
1755 1706
1756 rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); 1707 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1757 if (!rhdr) { 1708 if (!hdr) {
1758 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1709 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1759 ret = -ENOMEM; 1710 ret = -ENOMEM;
1760 pnfs_put_lseg(desc->pg_lseg); 1711 pnfs_put_lseg(desc->pg_lseg);
1761 desc->pg_lseg = NULL; 1712 desc->pg_lseg = NULL;
1762 return ret; 1713 return ret;
1763 } 1714 }
1764 hdr = &rhdr->header;
1765 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 1715 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1766 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1716 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1767 atomic_inc(&hdr->refcnt);
1768 ret = nfs_generic_pgio(desc, hdr); 1717 ret = nfs_generic_pgio(desc, hdr);
1769 if (ret != 0) { 1718 if (ret != 0) {
1770 pnfs_put_lseg(desc->pg_lseg); 1719 pnfs_put_lseg(desc->pg_lseg);
1771 desc->pg_lseg = NULL; 1720 desc->pg_lseg = NULL;
1772 } else 1721 } else
1773 pnfs_do_read(desc, hdr); 1722 pnfs_do_read(desc, hdr);
1774 if (atomic_dec_and_test(&hdr->refcnt))
1775 hdr->completion_ops->completion(hdr);
1776 return ret; 1723 return ret;
1777} 1724}
1778EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); 1725EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
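
pnfs_do_read() and pnfs_do_write() keep the same shape after the conversion: ask the layout driver to attempt the I/O, and on PNFS_NOT_ATTEMPTED reroute the header through the MDS. A compact sketch of that dispatch, with invented names and a bare enum:

#include <stdio.h>

enum try_status { ATTEMPTED, NOT_ATTEMPTED };

static enum try_status try_to_read(int have_layout)
{
	return have_layout ? ATTEMPTED : NOT_ATTEMPTED;
}

static void read_through_mds(void)
{
	/* analogue of pnfs_read_through_mds(): recoalesce and resend */
	printf("recoalesce and resend via MDS\n");
}

static void do_read(int have_layout)
{
	if (try_to_read(have_layout) == NOT_ATTEMPTED)
		read_through_mds();
	else
		printf("layout driver took the I/O\n");
}

int main(void)
{
	do_read(1);
	do_read(0);
	return 0;
}
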
@@ -1820,12 +1767,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1820EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); 1767EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1821 1768
1822void 1769void
1823pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) 1770pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
1824{ 1771{
1825 struct nfs_pgio_header *hdr = wdata->header;
1826 struct inode *inode = hdr->inode; 1772 struct inode *inode = hdr->inode;
1827 struct nfs_inode *nfsi = NFS_I(inode); 1773 struct nfs_inode *nfsi = NFS_I(inode);
1828 loff_t end_pos = wdata->mds_offset + wdata->res.count; 1774 loff_t end_pos = hdr->mds_offset + hdr->res.count;
1829 bool mark_as_dirty = false; 1775 bool mark_as_dirty = false;
1830 1776
1831 spin_lock(&inode->i_lock); 1777 spin_lock(&inode->i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4fb309a2b4c4..aca3dff5dae6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
32 32
33#include <linux/nfs_fs.h> 33#include <linux/nfs_fs.h>
34#include <linux/nfs_page.h> 34#include <linux/nfs_page.h>
35#include <linux/workqueue.h>
35 36
36enum { 37enum {
37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
46 atomic_t pls_refcount; 47 atomic_t pls_refcount;
47 unsigned long pls_flags; 48 unsigned long pls_flags;
48 struct pnfs_layout_hdr *pls_layout; 49 struct pnfs_layout_hdr *pls_layout;
50 struct work_struct pls_work;
49}; 51};
50 52
51enum pnfs_try_status { 53enum pnfs_try_status {
@@ -104,6 +106,8 @@ struct pnfs_layoutdriver_type {
104 int max); 106 int max);
105 void (*recover_commit_reqs) (struct list_head *list, 107 void (*recover_commit_reqs) (struct list_head *list,
106 struct nfs_commit_info *cinfo); 108 struct nfs_commit_info *cinfo);
109 struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
110 struct page *page);
107 int (*commit_pagelist)(struct inode *inode, 111 int (*commit_pagelist)(struct inode *inode,
108 struct list_head *mds_pages, 112 struct list_head *mds_pages,
109 int how, 113 int how,
@@ -113,8 +117,8 @@ struct pnfs_layoutdriver_type {
113 * Return PNFS_ATTEMPTED to indicate the layout code has attempted 117 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
114 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS 118 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
115 */ 119 */
116 enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); 120 enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
117 enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); 121 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
118 122
119 void (*free_deviceid_node) (struct nfs4_deviceid_node *); 123 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
120 124
@@ -179,6 +183,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
179/* pnfs.c */ 183/* pnfs.c */
180void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 184void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
181void pnfs_put_lseg(struct pnfs_layout_segment *lseg); 185void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
186void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg);
182 187
183void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); 188void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
184void unset_pnfs_layoutdriver(struct nfs_server *); 189void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +218,13 @@ bool pnfs_roc(struct inode *ino);
213void pnfs_roc_release(struct inode *ino); 218void pnfs_roc_release(struct inode *ino);
214void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 219void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
215bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 220bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
216void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); 221void pnfs_set_layoutcommit(struct nfs_pgio_header *);
217void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 222void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
218int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 223int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
219int _pnfs_return_layout(struct inode *); 224int _pnfs_return_layout(struct inode *);
220int pnfs_commit_and_return_layout(struct inode *); 225int pnfs_commit_and_return_layout(struct inode *);
221void pnfs_ld_write_done(struct nfs_pgio_data *); 226void pnfs_ld_write_done(struct nfs_pgio_header *);
222void pnfs_ld_read_done(struct nfs_pgio_data *); 227void pnfs_ld_read_done(struct nfs_pgio_header *);
223struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 228struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
224 struct nfs_open_context *ctx, 229 struct nfs_open_context *ctx,
225 loff_t pos, 230 loff_t pos,
@@ -228,12 +233,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
228 gfp_t gfp_flags); 233 gfp_t gfp_flags);
229 234
230void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 235void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
231int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, 236int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
232 const struct nfs_pgio_completion_ops *compl_ops, 237int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
233 struct nfs_direct_req *dreq);
234int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
235 const struct nfs_pgio_completion_ops *compl_ops,
236 struct nfs_direct_req *dreq);
237struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 238struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
238 239
239/* nfs4_deviceid_flags */ 240/* nfs4_deviceid_flags */
@@ -345,6 +346,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
345 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); 346 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
346} 347}
347 348
349static inline struct nfs_page *
350pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
351 struct page *page)
352{
353 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
354
355 if (ld == NULL || ld->search_commit_reqs == NULL)
356 return NULL;
357 return ld->search_commit_reqs(cinfo, page);
358}
359
348/* Should the pNFS client commit and return the layout upon a setattr */ 360/* Should the pNFS client commit and return the layout upon a setattr */
349static inline bool 361static inline bool
350pnfs_ld_layoutret_on_setattr(struct inode *inode) 362pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -410,6 +422,10 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
410{ 422{
411} 423}
412 424
425static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
426{
427}
428
413static inline int pnfs_return_layout(struct inode *ino) 429static inline int pnfs_return_layout(struct inode *ino)
414{ 430{
415 return 0; 431 return 0;
@@ -496,6 +512,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
496{ 512{
497} 513}
498 514
515static inline struct nfs_page *
516pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
517 struct page *page)
518{
519 return NULL;
520}
521
499static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) 522static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
500{ 523{
501 return 0; 524 return 0;
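
The pnfs.h hunks above keep the header's long-standing pattern: each pNFS hook that changes signature (read_pagelist, write_pagelist, the resend-to-MDS helpers) or is newly added (pnfs_put_lseg_async, pnfs_search_commit_reqs) also gets a no-op static inline twin in the !CONFIG_NFS_V4_1 branch, so generic callers build unconditionally. A minimal sketch of a caller relying on that; demo_find_commit_req is an illustrative name, not anything in the patch:

	static struct nfs_page *
	demo_find_commit_req(struct inode *inode, struct nfs_commit_info *cinfo,
			     struct page *page)
	{
		struct nfs_page *req;

		/* with pNFS compiled out the stub returns NULL and we
		 * fall straight through to the MDS commit list */
		req = pnfs_search_commit_reqs(inode, cinfo, page);
		if (req != NULL)
			return req;	/* a layout driver claimed the page */
		/* ... otherwise scan cinfo->mds->list, as write.c now does ... */
		return NULL;
	}
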
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c171ce1a8a30..b09cc23d6f43 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
578 return 0; 578 return 0;
579} 579}
580 580
581static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 581static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
582{ 582{
583 struct inode *inode = data->header->inode; 583 struct inode *inode = hdr->inode;
584 584
585 nfs_invalidate_atime(inode); 585 nfs_invalidate_atime(inode);
586 if (task->tk_status >= 0) { 586 if (task->tk_status >= 0) {
587 nfs_refresh_inode(inode, data->res.fattr); 587 nfs_refresh_inode(inode, hdr->res.fattr);
588 /* Emulate the eof flag, which isn't normally needed in NFSv2 588 /* Emulate the eof flag, which isn't normally needed in NFSv2
589 * as it is guaranteed to always return the file attributes 589 * as it is guaranteed to always return the file attributes
590 */ 590 */
591 if (data->args.offset + data->res.count >= data->res.fattr->size) 591 if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
592 data->res.eof = 1; 592 hdr->res.eof = 1;
593 } 593 }
594 return 0; 594 return 0;
595} 595}
596 596
597static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 597static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
598 struct rpc_message *msg)
598{ 599{
599 msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; 600 msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
600} 601}
601 602
602static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 603static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
604 struct nfs_pgio_header *hdr)
603{ 605{
604 rpc_call_start(task); 606 rpc_call_start(task);
605 return 0; 607 return 0;
606} 608}
607 609
608static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 610static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
609{ 611{
610 struct inode *inode = data->header->inode; 612 struct inode *inode = hdr->inode;
611 613
612 if (task->tk_status >= 0) 614 if (task->tk_status >= 0)
613 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 615 nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
614 return 0; 616 return 0;
615} 617}
616 618
617static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 619static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
620 struct rpc_message *msg)
618{ 621{
619 /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ 622 /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
620 data->args.stable = NFS_FILE_SYNC; 623 hdr->args.stable = NFS_FILE_SYNC;
621 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; 624 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
622} 625}
623 626
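
The nfs_read_done() hunk above preserves the NFSv2 quirk it documents: a v2 READ reply has no eof bit on the wire, but it always carries post-op attributes, so eof can be inferred whenever the read reaches the attribute-reported file size. A standalone restatement of the check (illustrative, not the patch's code):

	#include <stdbool.h>
	#include <stdint.h>

	/* mirrors the test in nfs_read_done(): a read at offset 4096 that
	 * returns 4000 bytes of an 8000-byte file sets eof (8096 >= 8000) */
	static bool nfsv2_infer_eof(uint64_t offset, uint32_t count,
				    uint64_t fattr_size)
	{
		return offset + count >= fattr_size;
	}
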
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e818a475ca64..beff2769c5c5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
33 33
34static struct kmem_cache *nfs_rdata_cachep; 34static struct kmem_cache *nfs_rdata_cachep;
35 35
36static struct nfs_rw_header *nfs_readhdr_alloc(void) 36static struct nfs_pgio_header *nfs_readhdr_alloc(void)
37{ 37{
38 return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); 38 return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
39} 39}
40 40
41static void nfs_readhdr_free(struct nfs_rw_header *rhdr) 41static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
42{ 42{
43 kmem_cache_free(nfs_rdata_cachep, rhdr); 43 kmem_cache_free(nfs_rdata_cachep, rhdr);
44} 44}
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
115 115
116 unlock_page(req->wb_page); 116 unlock_page(req->wb_page);
117 } 117 }
118
119 dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
120 req->wb_context->dentry->d_inode->i_sb->s_id,
121 (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
122 req->wb_bytes,
123 (long long)req_offset(req));
124 nfs_release_request(req); 118 nfs_release_request(req);
125} 119}
126 120
@@ -172,14 +166,15 @@ out:
172 hdr->release(hdr); 166 hdr->release(hdr);
173} 167}
174 168
175static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, 169static void nfs_initiate_read(struct nfs_pgio_header *hdr,
170 struct rpc_message *msg,
176 struct rpc_task_setup *task_setup_data, int how) 171 struct rpc_task_setup *task_setup_data, int how)
177{ 172{
178 struct inode *inode = data->header->inode; 173 struct inode *inode = hdr->inode;
179 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 174 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
180 175
181 task_setup_data->flags |= swap_flags; 176 task_setup_data->flags |= swap_flags;
182 NFS_PROTO(inode)->read_setup(data, msg); 177 NFS_PROTO(inode)->read_setup(hdr, msg);
183} 178}
184 179
185static void 180static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
203 * This is the callback from RPC telling us whether a reply was 198 * This is the callback from RPC telling us whether a reply was
204 * received or some error occurred (timeout or socket shutdown). 199 * received or some error occurred (timeout or socket shutdown).
205 */ 200 */
206static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, 201static int nfs_readpage_done(struct rpc_task *task,
202 struct nfs_pgio_header *hdr,
207 struct inode *inode) 203 struct inode *inode)
208{ 204{
209 int status = NFS_PROTO(inode)->read_done(task, data); 205 int status = NFS_PROTO(inode)->read_done(task, hdr);
210 if (status != 0) 206 if (status != 0)
211 return status; 207 return status;
212 208
213 nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count); 209 nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
214 210
215 if (task->tk_status == -ESTALE) { 211 if (task->tk_status == -ESTALE) {
216 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 212 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
219 return 0; 215 return 0;
220} 216}
221 217
222static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) 218static void nfs_readpage_retry(struct rpc_task *task,
219 struct nfs_pgio_header *hdr)
223{ 220{
224 struct nfs_pgio_args *argp = &data->args; 221 struct nfs_pgio_args *argp = &hdr->args;
225 struct nfs_pgio_res *resp = &data->res; 222 struct nfs_pgio_res *resp = &hdr->res;
226 223
227 /* This is a short read! */ 224 /* This is a short read! */
228 nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); 225 nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
229 /* Has the server at least made some progress? */ 226 /* Has the server at least made some progress? */
230 if (resp->count == 0) { 227 if (resp->count == 0) {
231 nfs_set_pgio_error(data->header, -EIO, argp->offset); 228 nfs_set_pgio_error(hdr, -EIO, argp->offset);
232 return; 229 return;
233 } 230 }
234 /* Yes, so retry the read at the end of the data */ 231 /* Yes, so retry the read at the end of the data */
235 data->mds_offset += resp->count; 232 hdr->mds_offset += resp->count;
236 argp->offset += resp->count; 233 argp->offset += resp->count;
237 argp->pgbase += resp->count; 234 argp->pgbase += resp->count;
238 argp->count -= resp->count; 235 argp->count -= resp->count;
239 rpc_restart_call_prepare(task); 236 rpc_restart_call_prepare(task);
240} 237}
241 238
242static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) 239static void nfs_readpage_result(struct rpc_task *task,
240 struct nfs_pgio_header *hdr)
243{ 241{
244 struct nfs_pgio_header *hdr = data->header; 242 if (hdr->res.eof) {
245
246 if (data->res.eof) {
247 loff_t bound; 243 loff_t bound;
248 244
249 bound = data->args.offset + data->res.count; 245 bound = hdr->args.offset + hdr->res.count;
250 spin_lock(&hdr->lock); 246 spin_lock(&hdr->lock);
251 if (bound < hdr->io_start + hdr->good_bytes) { 247 if (bound < hdr->io_start + hdr->good_bytes) {
252 set_bit(NFS_IOHDR_EOF, &hdr->flags); 248 set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
254 hdr->good_bytes = bound - hdr->io_start; 250 hdr->good_bytes = bound - hdr->io_start;
255 } 251 }
256 spin_unlock(&hdr->lock); 252 spin_unlock(&hdr->lock);
257 } else if (data->res.count != data->args.count) 253 } else if (hdr->res.count != hdr->args.count)
258 nfs_readpage_retry(task, data); 254 nfs_readpage_retry(task, hdr);
259} 255}
260 256
261/* 257/*
@@ -404,7 +400,7 @@ out:
404int __init nfs_init_readpagecache(void) 400int __init nfs_init_readpagecache(void)
405{ 401{
406 nfs_rdata_cachep = kmem_cache_create("nfs_read_data", 402 nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
407 sizeof(struct nfs_rw_header), 403 sizeof(struct nfs_pgio_header),
408 0, SLAB_HWCACHE_ALIGN, 404 0, SLAB_HWCACHE_ALIGN,
409 NULL); 405 NULL);
410 if (nfs_rdata_cachep == NULL) 406 if (nfs_rdata_cachep == NULL)
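
nfs_readpage_retry() above restarts a short read by sliding the request window past the bytes the server already returned: mds_offset, offset and pgbase all advance by resp->count while count shrinks by the same amount. A small userspace model of that arithmetic (demo names only):

	#include <stdint.h>
	#include <stdio.h>

	struct demo_args {
		uint64_t offset;	/* file offset of the next RPC */
		uint32_t pgbase;	/* offset into the page array */
		uint32_t count;		/* bytes still outstanding */
	};

	static void advance_after_short_read(struct demo_args *argp, uint32_t got)
	{
		argp->offset += got;
		argp->pgbase += got;
		argp->count  -= got;
	}

	int main(void)
	{
		struct demo_args a = { .offset = 0, .pgbase = 0, .count = 16384 };

		advance_after_short_read(&a, 4096);	/* server returned 4 KiB */
		printf("retry at offset %llu for %u bytes\n",
		       (unsigned long long)a.offset, a.count);	/* 4096, 12288 */
		return 0;
	}
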
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 084af1060d79..e4499d5b51e8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
1027 rpc_authflavor_t flavor) 1027 rpc_authflavor_t flavor)
1028{ 1028{
1029 unsigned int i; 1029 unsigned int i;
1030 unsigned int max_flavor_len = (sizeof(auth_info->flavors) / 1030 unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
1031 sizeof(auth_info->flavors[0]));
1032 1031
1033 /* make sure this flavor isn't already in the list */ 1032 /* make sure this flavor isn't already in the list */
1034 for (i = 0; i < auth_info->flavor_len; i++) { 1033 for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2180,7 +2179,7 @@ out_no_address:
2180 return -EINVAL; 2179 return -EINVAL;
2181} 2180}
2182 2181
2183#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ 2182#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
2184 | NFS_MOUNT_SECURE \ 2183 | NFS_MOUNT_SECURE \
2185 | NFS_MOUNT_TCP \ 2184 | NFS_MOUNT_TCP \
2186 | NFS_MOUNT_VER3 \ 2185 | NFS_MOUNT_VER3 \
@@ -2188,15 +2187,16 @@ out_no_address:
2188 | NFS_MOUNT_NONLM \ 2187 | NFS_MOUNT_NONLM \
2189 | NFS_MOUNT_BROKEN_SUID \ 2188 | NFS_MOUNT_BROKEN_SUID \
2190 | NFS_MOUNT_STRICTLOCK \ 2189 | NFS_MOUNT_STRICTLOCK \
2191 | NFS_MOUNT_UNSHARED \
2192 | NFS_MOUNT_NORESVPORT \
2193 | NFS_MOUNT_LEGACY_INTERFACE) 2190 | NFS_MOUNT_LEGACY_INTERFACE)
2194 2191
2192#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
2193 ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
2194
2195static int 2195static int
2196nfs_compare_remount_data(struct nfs_server *nfss, 2196nfs_compare_remount_data(struct nfs_server *nfss,
2197 struct nfs_parsed_mount_data *data) 2197 struct nfs_parsed_mount_data *data)
2198{ 2198{
2199 if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || 2199 if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
2200 data->rsize != nfss->rsize || 2200 data->rsize != nfss->rsize ||
2201 data->wsize != nfss->wsize || 2201 data->wsize != nfss->wsize ||
2202 data->version != nfss->nfs_client->rpc_ops->version || 2202 data->version != nfss->nfs_client->rpc_ops->version ||
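
The mask split above changes what a remount may alter: NFS_REMOUNT_CMP_FLAGMASK still compares NFS_MOUNT_UNSHARED and NFS_MOUNT_NORESVPORT (so flipping them on remount is rejected), while NFS_MOUNT_CMP_FLAGMASK masks them back out for superblock-sharing comparisons. A toy check of that relationship; the flag values are assumed from include/uapi/linux/nfs_mount.h and the remaining mask bits are elided:

	#include <assert.h>
	#include <stdint.h>

	#define DEMO_MOUNT_INTR		0x0002u
	#define DEMO_MOUNT_UNSHARED	0x8000u
	#define DEMO_MOUNT_NORESVPORT	0x40000u

	#define DEMO_REMOUNT_CMP_FLAGMASK	(~DEMO_MOUNT_INTR /* | ... */)
	#define DEMO_MOUNT_CMP_FLAGMASK		(DEMO_REMOUNT_CMP_FLAGMASK & \
						 ~(DEMO_MOUNT_UNSHARED | DEMO_MOUNT_NORESVPORT))

	int main(void)
	{
		uint32_t cur = 0, want = DEMO_MOUNT_NORESVPORT;

		/* a remount sees the flipped bit and can reject it ... */
		assert((cur ^ want) & DEMO_REMOUNT_CMP_FLAGMASK);
		/* ... while sharing comparisons still treat both as equal */
		assert(!((cur ^ want) & DEMO_MOUNT_CMP_FLAGMASK));
		return 0;
	}
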
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 962c9ee758be..e3b5cf28bdc5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -47,6 +47,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
47static const struct nfs_commit_completion_ops nfs_commit_completion_ops; 47static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
48static const struct nfs_rw_ops nfs_rw_write_ops; 48static const struct nfs_rw_ops nfs_rw_write_ops;
49static void nfs_clear_request_commit(struct nfs_page *req); 49static void nfs_clear_request_commit(struct nfs_page *req);
50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
51 struct inode *inode);
50 52
51static struct kmem_cache *nfs_wdata_cachep; 53static struct kmem_cache *nfs_wdata_cachep;
52static mempool_t *nfs_wdata_mempool; 54static mempool_t *nfs_wdata_mempool;
@@ -71,18 +73,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
71} 73}
72EXPORT_SYMBOL_GPL(nfs_commit_free); 74EXPORT_SYMBOL_GPL(nfs_commit_free);
73 75
74static struct nfs_rw_header *nfs_writehdr_alloc(void) 76static struct nfs_pgio_header *nfs_writehdr_alloc(void)
75{ 77{
76 struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); 78 struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
77 79
78 if (p) 80 if (p)
79 memset(p, 0, sizeof(*p)); 81 memset(p, 0, sizeof(*p));
80 return p; 82 return p;
81} 83}
82 84
83static void nfs_writehdr_free(struct nfs_rw_header *whdr) 85static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
84{ 86{
85 mempool_free(whdr, nfs_wdata_mempool); 87 mempool_free(hdr, nfs_wdata_mempool);
86} 88}
87 89
88static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) 90static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -93,6 +95,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
93} 95}
94 96
95/* 97/*
98 * nfs_page_search_commits_for_head_request_locked
99 *
100 * Search through commit lists on @inode for the head request for @page.
101 * Must be called while holding the inode lock (which doubles as the cinfo lock).
102 *
103 * Returns the head request if found, or NULL if not found.
104 */
105static struct nfs_page *
106nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
107 struct page *page)
108{
109 struct nfs_page *freq, *t;
110 struct nfs_commit_info cinfo;
111 struct inode *inode = &nfsi->vfs_inode;
112
113 nfs_init_cinfo_from_inode(&cinfo, inode);
114
115 /* search through pnfs commit lists */
116 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
117 if (freq)
118 return freq->wb_head;
119
120 /* Linearly search the commit list for the correct request */
121 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
122 if (freq->wb_page == page)
123 return freq->wb_head;
124 }
125
126 return NULL;
127}
128
129/*
96 * nfs_page_find_head_request_locked - find head request associated with @page 130 * nfs_page_find_head_request_locked - find head request associated with @page
97 * 131 *
98 * must be called while holding the inode lock. 132 * must be called while holding the inode lock.
@@ -106,21 +140,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
106 140
107 if (PagePrivate(page)) 141 if (PagePrivate(page))
108 req = (struct nfs_page *)page_private(page); 142 req = (struct nfs_page *)page_private(page);
109 else if (unlikely(PageSwapCache(page))) { 143 else if (unlikely(PageSwapCache(page)))
110 struct nfs_page *freq, *t; 144 req = nfs_page_search_commits_for_head_request_locked(nfsi,
111 145 page);
112 /* Linearly search the commit list for the correct req */
113 list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
114 if (freq->wb_page == page) {
115 req = freq->wb_head;
116 break;
117 }
118 }
119 }
120 146
121 if (req) { 147 if (req) {
122 WARN_ON_ONCE(req->wb_head != req); 148 WARN_ON_ONCE(req->wb_head != req);
123
124 kref_get(&req->wb_kref); 149 kref_get(&req->wb_kref);
125 } 150 }
126 151
@@ -216,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
216 unsigned int pos = 0; 241 unsigned int pos = 0;
217 unsigned int len = nfs_page_length(req->wb_page); 242 unsigned int len = nfs_page_length(req->wb_page);
218 243
219 nfs_page_group_lock(req); 244 nfs_page_group_lock(req, true);
220 245
221 do { 246 do {
222 tmp = nfs_page_group_search_locked(req->wb_head, pos); 247 tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -379,8 +404,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
379 subreq->wb_head = subreq; 404 subreq->wb_head = subreq;
380 subreq->wb_this_page = subreq; 405 subreq->wb_this_page = subreq;
381 406
382 nfs_clear_request_commit(subreq);
383
384 /* subreq is now totally disconnected from page group or any 407 /* subreq is now totally disconnected from page group or any
385 * write / commit lists. last chance to wake any waiters */ 408 * write / commit lists. last chance to wake any waiters */
386 nfs_unlock_request(subreq); 409 nfs_unlock_request(subreq);
@@ -456,7 +479,9 @@ try_again:
456 } 479 }
457 480
458 /* lock each request in the page group */ 481 /* lock each request in the page group */
459 nfs_page_group_lock(head); 482 ret = nfs_page_group_lock(head, false);
483 if (ret < 0)
484 return ERR_PTR(ret);
460 subreq = head; 485 subreq = head;
461 do { 486 do {
462 /* 487 /*
@@ -488,7 +513,7 @@ try_again:
488 * Commit list removal accounting is done after locks are dropped */ 513 * Commit list removal accounting is done after locks are dropped */
489 subreq = head; 514 subreq = head;
490 do { 515 do {
491 nfs_list_remove_request(subreq); 516 nfs_clear_request_commit(subreq);
492 subreq = subreq->wb_this_page; 517 subreq = subreq->wb_this_page;
493 } while (subreq != head); 518 } while (subreq != head);
494 519
@@ -518,15 +543,11 @@ try_again:
518 543
519 nfs_page_group_unlock(head); 544 nfs_page_group_unlock(head);
520 545
521 /* drop lock to clear_request_commit the head req and clean up 546 /* drop lock to clean up requests on destroy list */
522 * requests on destroy list */
523 spin_unlock(&inode->i_lock); 547 spin_unlock(&inode->i_lock);
524 548
525 nfs_destroy_unlinked_subrequests(destroy_list, head); 549 nfs_destroy_unlinked_subrequests(destroy_list, head);
526 550
527 /* clean up commit list state */
528 nfs_clear_request_commit(head);
529
530 /* still holds ref on head from nfs_page_find_head_request_locked 551 /* still holds ref on head from nfs_page_find_head_request_locked
531 * and still has lock on head from lock loop */ 552 * and still has lock on head from lock loop */
532 return head; 553 return head;
@@ -705,6 +726,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
705 726
706 if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) 727 if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
707 nfs_release_request(req); 728 nfs_release_request(req);
729 else
730 WARN_ON_ONCE(1);
708} 731}
709 732
710static void 733static void
@@ -808,6 +831,7 @@ nfs_clear_page_commit(struct page *page)
808 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); 831 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
809} 832}
810 833
834/* Called holding the inode lock (which doubles as the cinfo lock) */
811static void 835static void
812nfs_clear_request_commit(struct nfs_page *req) 836nfs_clear_request_commit(struct nfs_page *req)
813{ 837{
@@ -817,20 +841,17 @@ nfs_clear_request_commit(struct nfs_page *req)
817 841
818 nfs_init_cinfo_from_inode(&cinfo, inode); 842 nfs_init_cinfo_from_inode(&cinfo, inode);
819 if (!pnfs_clear_request_commit(req, &cinfo)) { 843 if (!pnfs_clear_request_commit(req, &cinfo)) {
820 spin_lock(cinfo.lock);
821 nfs_request_remove_commit_list(req, &cinfo); 844 nfs_request_remove_commit_list(req, &cinfo);
822 spin_unlock(cinfo.lock);
823 } 845 }
824 nfs_clear_page_commit(req->wb_page); 846 nfs_clear_page_commit(req->wb_page);
825 } 847 }
826} 848}
827 849
828static inline 850int nfs_write_need_commit(struct nfs_pgio_header *hdr)
829int nfs_write_need_commit(struct nfs_pgio_data *data)
830{ 851{
831 if (data->verf.committed == NFS_DATA_SYNC) 852 if (hdr->verf.committed == NFS_DATA_SYNC)
832 return data->header->lseg == NULL; 853 return hdr->lseg == NULL;
833 return data->verf.committed != NFS_FILE_SYNC; 854 return hdr->verf.committed != NFS_FILE_SYNC;
834} 855}
835 856
836#else 857#else
@@ -856,8 +877,7 @@ nfs_clear_request_commit(struct nfs_page *req)
856{ 877{
857} 878}
858 879
859static inline 880int nfs_write_need_commit(struct nfs_pgio_header *hdr)
860int nfs_write_need_commit(struct nfs_pgio_data *data)
861{ 881{
862 return 0; 882 return 0;
863} 883}
@@ -883,11 +903,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
883 nfs_context_set_write_error(req->wb_context, hdr->error); 903 nfs_context_set_write_error(req->wb_context, hdr->error);
884 goto remove_req; 904 goto remove_req;
885 } 905 }
886 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { 906 if (nfs_write_need_commit(hdr)) {
887 nfs_mark_request_dirty(req);
888 goto next;
889 }
890 if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
891 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); 907 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
892 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 908 nfs_mark_request_commit(req, hdr->lseg, &cinfo);
893 goto next; 909 goto next;
@@ -1038,9 +1054,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
1038 else 1054 else
1039 req->wb_bytes = rqend - req->wb_offset; 1055 req->wb_bytes = rqend - req->wb_offset;
1040out_unlock: 1056out_unlock:
1041 spin_unlock(&inode->i_lock);
1042 if (req) 1057 if (req)
1043 nfs_clear_request_commit(req); 1058 nfs_clear_request_commit(req);
1059 spin_unlock(&inode->i_lock);
1044 return req; 1060 return req;
1045out_flushme: 1061out_flushme:
1046 spin_unlock(&inode->i_lock); 1062 spin_unlock(&inode->i_lock);
@@ -1241,17 +1257,18 @@ static int flush_task_priority(int how)
1241 return RPC_PRIORITY_NORMAL; 1257 return RPC_PRIORITY_NORMAL;
1242} 1258}
1243 1259
1244static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, 1260static void nfs_initiate_write(struct nfs_pgio_header *hdr,
1261 struct rpc_message *msg,
1245 struct rpc_task_setup *task_setup_data, int how) 1262 struct rpc_task_setup *task_setup_data, int how)
1246{ 1263{
1247 struct inode *inode = data->header->inode; 1264 struct inode *inode = hdr->inode;
1248 int priority = flush_task_priority(how); 1265 int priority = flush_task_priority(how);
1249 1266
1250 task_setup_data->priority = priority; 1267 task_setup_data->priority = priority;
1251 NFS_PROTO(inode)->write_setup(data, msg); 1268 NFS_PROTO(inode)->write_setup(hdr, msg);
1252 1269
1253 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, 1270 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
1254 &task_setup_data->rpc_client, msg, data); 1271 &task_setup_data->rpc_client, msg, hdr);
1255} 1272}
1256 1273
1257/* If a nfs_flush_* function fails, it should remove reqs from @head and 1274/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1330,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
1313 NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); 1330 NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
1314} 1331}
1315 1332
1316static void nfs_writeback_release_common(struct nfs_pgio_data *data) 1333static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
1317{ 1334{
1318 struct nfs_pgio_header *hdr = data->header; 1335 /* do nothing! */
1319 int status = data->task.tk_status;
1320
1321 if ((status >= 0) && nfs_write_need_commit(data)) {
1322 spin_lock(&hdr->lock);
1323 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
1324 ; /* Do nothing */
1325 else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
1326 memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
1327 else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
1328 set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
1329 spin_unlock(&hdr->lock);
1330 }
1331} 1336}
1332 1337
1333/* 1338/*
@@ -1358,7 +1363,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
1358/* 1363/*
1359 * This function is called when the WRITE call is complete. 1364 * This function is called when the WRITE call is complete.
1360 */ 1365 */
1361static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, 1366static int nfs_writeback_done(struct rpc_task *task,
1367 struct nfs_pgio_header *hdr,
1362 struct inode *inode) 1368 struct inode *inode)
1363{ 1369{
1364 int status; 1370 int status;
@@ -1370,13 +1376,14 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1370 * another writer had changed the file, but some applications 1376 * another writer had changed the file, but some applications
1371 * depend on tighter cache coherency when writing. 1377 * depend on tighter cache coherency when writing.
1372 */ 1378 */
1373 status = NFS_PROTO(inode)->write_done(task, data); 1379 status = NFS_PROTO(inode)->write_done(task, hdr);
1374 if (status != 0) 1380 if (status != 0)
1375 return status; 1381 return status;
1376 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); 1382 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
1377 1383
1378#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 1384#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1379 if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { 1385 if (hdr->res.verf->committed < hdr->args.stable &&
1386 task->tk_status >= 0) {
1380 /* We tried a write call, but the server did not 1387 /* We tried a write call, but the server did not
1381 * commit data to stable storage even though we 1388 * commit data to stable storage even though we
1382 * requested it. 1389 * requested it.
@@ -1392,7 +1399,7 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1392 dprintk("NFS: faulty NFS server %s:" 1399 dprintk("NFS: faulty NFS server %s:"
1393 " (committed = %d) != (stable = %d)\n", 1400 " (committed = %d) != (stable = %d)\n",
1394 NFS_SERVER(inode)->nfs_client->cl_hostname, 1401 NFS_SERVER(inode)->nfs_client->cl_hostname,
1395 data->res.verf->committed, data->args.stable); 1402 hdr->res.verf->committed, hdr->args.stable);
1396 complain = jiffies + 300 * HZ; 1403 complain = jiffies + 300 * HZ;
1397 } 1404 }
1398 } 1405 }
@@ -1407,16 +1414,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1407/* 1414/*
1408 * This function is called when the WRITE call is complete. 1415 * This function is called when the WRITE call is complete.
1409 */ 1416 */
1410static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) 1417static void nfs_writeback_result(struct rpc_task *task,
1418 struct nfs_pgio_header *hdr)
1411{ 1419{
1412 struct nfs_pgio_args *argp = &data->args; 1420 struct nfs_pgio_args *argp = &hdr->args;
1413 struct nfs_pgio_res *resp = &data->res; 1421 struct nfs_pgio_res *resp = &hdr->res;
1414 1422
1415 if (resp->count < argp->count) { 1423 if (resp->count < argp->count) {
1416 static unsigned long complain; 1424 static unsigned long complain;
1417 1425
1418 /* This is a short write! */ 1426 /* This is a short write! */
1419 nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); 1427 nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
1420 1428
1421 /* Has the server at least made some progress? */ 1429 /* Has the server at least made some progress? */
1422 if (resp->count == 0) { 1430 if (resp->count == 0) {
@@ -1426,14 +1434,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
1426 argp->count); 1434 argp->count);
1427 complain = jiffies + 300 * HZ; 1435 complain = jiffies + 300 * HZ;
1428 } 1436 }
1429 nfs_set_pgio_error(data->header, -EIO, argp->offset); 1437 nfs_set_pgio_error(hdr, -EIO, argp->offset);
1430 task->tk_status = -EIO; 1438 task->tk_status = -EIO;
1431 return; 1439 return;
1432 } 1440 }
1433 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1441 /* Was this an NFSv2 write or an NFSv3 stable write? */
1434 if (resp->verf->committed != NFS_UNSTABLE) { 1442 if (resp->verf->committed != NFS_UNSTABLE) {
1435 /* Resend from where the server left off */ 1443 /* Resend from where the server left off */
1436 data->mds_offset += resp->count; 1444 hdr->mds_offset += resp->count;
1437 argp->offset += resp->count; 1445 argp->offset += resp->count;
1438 argp->pgbase += resp->count; 1446 argp->pgbase += resp->count;
1439 argp->count -= resp->count; 1447 argp->count -= resp->count;
@@ -1884,7 +1892,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1884int __init nfs_init_writepagecache(void) 1892int __init nfs_init_writepagecache(void)
1885{ 1893{
1886 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1894 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1887 sizeof(struct nfs_rw_header), 1895 sizeof(struct nfs_pgio_header),
1888 0, SLAB_HWCACHE_ALIGN, 1896 0, SLAB_HWCACHE_ALIGN,
1889 NULL); 1897 NULL);
1890 if (nfs_wdata_cachep == NULL) 1898 if (nfs_wdata_cachep == NULL)
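
With NFS_IOHDR_NEED_COMMIT and NFS_IOHDR_NEED_RESCHED gone, nfs_write_completion() above queries nfs_write_need_commit() directly against the verifier stored in the header. A standalone restatement of the predicate (the enum mirrors the nfs3 stable_how values; names are illustrative):

	#include <stdbool.h>

	enum demo_stable_how {
		DEMO_UNSTABLE = 0,	/* server may still cache the data */
		DEMO_DATA_SYNC = 1,	/* data stable, metadata not */
		DEMO_FILE_SYNC = 2,	/* everything on stable storage */
	};

	/* FILE_SYNC never needs a COMMIT; DATA_SYNC needs one only when
	 * the write went to the MDS (no layout segment); UNSTABLE always does */
	static bool demo_write_need_commit(enum demo_stable_how committed,
					   bool has_lseg)
	{
		if (committed == DEMO_DATA_SYNC)
			return !has_lseg;
		return committed != DEMO_FILE_SYNC;
	}
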
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index ed628f71274c..538f142935ea 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -30,9 +30,6 @@
30 30
31MODULE_LICENSE("GPL"); 31MODULE_LICENSE("GPL");
32 32
33EXPORT_SYMBOL_GPL(nfsacl_encode);
34EXPORT_SYMBOL_GPL(nfsacl_decode);
35
36struct nfsacl_encode_desc { 33struct nfsacl_encode_desc {
37 struct xdr_array2_desc desc; 34 struct xdr_array2_desc desc;
38 unsigned int count; 35 unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
136 nfsacl_desc.desc.array_len; 133 nfsacl_desc.desc.array_len;
137 return err; 134 return err;
138} 135}
136EXPORT_SYMBOL_GPL(nfsacl_encode);
139 137
140struct nfsacl_decode_desc { 138struct nfsacl_decode_desc {
141 struct xdr_array2_desc desc; 139 struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
295 return 8 + nfsacl_desc.desc.elem_size * 293 return 8 + nfsacl_desc.desc.elem_size *
296 nfsacl_desc.desc.array_len; 294 nfsacl_desc.desc.array_len;
297} 295}
296EXPORT_SYMBOL_GPL(nfsacl_decode);
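
The nfsacl.c hunks are purely stylistic: kernel convention keeps EXPORT_SYMBOL_GPL() directly under the closing brace of the function it exports rather than in a block near the top of the file. The usual shape (demo_export is illustrative):

	#include <linux/export.h>

	int demo_export(void)
	{
		return 0;
	}
	EXPORT_SYMBOL_GPL(demo_export);
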
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e30f6059ecd6..5180a7ededec 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -52,6 +52,7 @@ struct nfs_access_entry {
52 unsigned long jiffies; 52 unsigned long jiffies;
53 struct rpc_cred * cred; 53 struct rpc_cred * cred;
54 int mask; 54 int mask;
55 struct rcu_head rcu_head;
55}; 56};
56 57
57struct nfs_lockowner { 58struct nfs_lockowner {
@@ -352,6 +353,7 @@ extern int nfs_release(struct inode *, struct file *);
352extern int nfs_attribute_timeout(struct inode *inode); 353extern int nfs_attribute_timeout(struct inode *inode);
353extern int nfs_attribute_cache_expired(struct inode *inode); 354extern int nfs_attribute_cache_expired(struct inode *inode);
354extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); 355extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
356extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode);
355extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); 357extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
356extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); 358extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
357extern int nfs_setattr(struct dentry *, struct iattr *); 359extern int nfs_setattr(struct dentry *, struct iattr *);
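
The new rcu_head in nfs_access_entry, together with nfs_revalidate_inode_rcu(), supports lockless (RCU-walk) permission checks: an entry can now be retired with kfree_rcu() instead of blocking for a grace period. A minimal sketch of the enabled pattern, with demo names standing in for the access-cache internals:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct demo_access_entry {
		int mask;
		struct rcu_head rcu_head;
	};

	static void demo_drop_entry(struct demo_access_entry *e)
	{
		/* readers still walking the cache under rcu_read_lock()
		 * stay safe: memory is reclaimed only after a grace period */
		kfree_rcu(e, rcu_head);
	}
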
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 1150ea41b626..922be2e050f5 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -45,6 +45,7 @@ struct nfs_client {
45 struct sockaddr_storage cl_addr; /* server identifier */ 45 struct sockaddr_storage cl_addr; /* server identifier */
46 size_t cl_addrlen; 46 size_t cl_addrlen;
47 char * cl_hostname; /* hostname of server */ 47 char * cl_hostname; /* hostname of server */
48 char * cl_acceptor; /* GSSAPI acceptor name */
48 struct list_head cl_share_link; /* link in global client list */ 49 struct list_head cl_share_link; /* link in global client list */
49 struct list_head cl_superblocks; /* List of nfs_server structs */ 50 struct list_head cl_superblocks; /* List of nfs_server structs */
50 51
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 7d9096d95d4a..6ad2bbcad405 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -26,7 +26,7 @@ enum {
26 PG_MAPPED, /* page private set for buffered io */ 26 PG_MAPPED, /* page private set for buffered io */
27 PG_CLEAN, /* write succeeded */ 27 PG_CLEAN, /* write succeeded */
28 PG_COMMIT_TO_DS, /* used by pnfs layouts */ 28 PG_COMMIT_TO_DS, /* used by pnfs layouts */
29 PG_INODE_REF, /* extra ref held by inode (head req only) */ 29 PG_INODE_REF, /* extra ref held by inode when in writeback */
30 PG_HEADLOCK, /* page group lock of wb_head */ 30 PG_HEADLOCK, /* page group lock of wb_head */
31 PG_TEARDOWN, /* page group sync for destroy */ 31 PG_TEARDOWN, /* page group sync for destroy */
32 PG_UNLOCKPAGE, /* page group sync bit in read path */ 32 PG_UNLOCKPAGE, /* page group sync bit in read path */
@@ -62,12 +62,13 @@ struct nfs_pageio_ops {
62 62
63struct nfs_rw_ops { 63struct nfs_rw_ops {
64 const fmode_t rw_mode; 64 const fmode_t rw_mode;
65 struct nfs_rw_header *(*rw_alloc_header)(void); 65 struct nfs_pgio_header *(*rw_alloc_header)(void);
66 void (*rw_free_header)(struct nfs_rw_header *); 66 void (*rw_free_header)(struct nfs_pgio_header *);
67 void (*rw_release)(struct nfs_pgio_data *); 67 void (*rw_release)(struct nfs_pgio_header *);
68 int (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *); 68 int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
69 void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *); 69 struct inode *);
70 void (*rw_initiate)(struct nfs_pgio_data *, struct rpc_message *, 70 void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
71 void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *,
71 struct rpc_task_setup *, int); 72 struct rpc_task_setup *, int);
72}; 73};
73 74
@@ -111,6 +112,8 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
111 int how); 112 int how);
112extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, 113extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
113 struct nfs_page *); 114 struct nfs_page *);
115extern int nfs_pageio_resend(struct nfs_pageio_descriptor *,
116 struct nfs_pgio_header *);
114extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc); 117extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
115extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t); 118extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
116extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, 119extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
@@ -119,7 +122,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
119extern int nfs_wait_on_request(struct nfs_page *); 122extern int nfs_wait_on_request(struct nfs_page *);
120extern void nfs_unlock_request(struct nfs_page *req); 123extern void nfs_unlock_request(struct nfs_page *req);
121extern void nfs_unlock_and_release_request(struct nfs_page *); 124extern void nfs_unlock_and_release_request(struct nfs_page *);
122extern void nfs_page_group_lock(struct nfs_page *); 125extern int nfs_page_group_lock(struct nfs_page *, bool);
123extern void nfs_page_group_unlock(struct nfs_page *); 126extern void nfs_page_group_unlock(struct nfs_page *);
124extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); 127extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
125 128
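
nfs_page_group_lock() now returns int and takes a wait flag: true blocks until the group lock is held (as nfs_page_group_covers_page() uses it), false may fail, and the caller must cope with the error (as nfs_lock_and_join_requests() does in the write.c hunk above). A hedged usage sketch; demo_join is a hypothetical caller:

	#include <linux/err.h>
	#include <linux/nfs_page.h>

	static struct nfs_page *demo_join(struct nfs_page *head)
	{
		int ret;

		ret = nfs_page_group_lock(head, false);	/* nonblocking attempt */
		if (ret < 0)
			return ERR_PTR(ret);
		/* ... walk head->wb_this_page around the group ... */
		nfs_page_group_unlock(head);
		return head;
	}
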
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9a1396e70310..0040629894df 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -993,6 +993,7 @@ struct nfs4_setclientid {
993 unsigned int sc_uaddr_len; 993 unsigned int sc_uaddr_len;
994 char sc_uaddr[RPCBIND_MAXUADDRLEN + 1]; 994 char sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
995 u32 sc_cb_ident; 995 u32 sc_cb_ident;
996 struct rpc_cred *sc_cred;
996}; 997};
997 998
998struct nfs4_setclientid_res { 999struct nfs4_setclientid_res {
@@ -1253,18 +1254,12 @@ enum {
1253 NFS_IOHDR_ERROR = 0, 1254 NFS_IOHDR_ERROR = 0,
1254 NFS_IOHDR_EOF, 1255 NFS_IOHDR_EOF,
1255 NFS_IOHDR_REDO, 1256 NFS_IOHDR_REDO,
1256 NFS_IOHDR_NEED_COMMIT,
1257 NFS_IOHDR_NEED_RESCHED,
1258}; 1257};
1259 1258
1260struct nfs_pgio_data;
1261
1262struct nfs_pgio_header { 1259struct nfs_pgio_header {
1263 struct inode *inode; 1260 struct inode *inode;
1264 struct rpc_cred *cred; 1261 struct rpc_cred *cred;
1265 struct list_head pages; 1262 struct list_head pages;
1266 struct nfs_pgio_data *data;
1267 atomic_t refcnt;
1268 struct nfs_page *req; 1263 struct nfs_page *req;
1269 struct nfs_writeverf verf; /* Used for writes */ 1264 struct nfs_writeverf verf; /* Used for writes */
1270 struct pnfs_layout_segment *lseg; 1265 struct pnfs_layout_segment *lseg;
@@ -1281,28 +1276,22 @@ struct nfs_pgio_header {
1281 int error; /* merge with pnfs_error */ 1276 int error; /* merge with pnfs_error */
1282 unsigned long good_bytes; /* boundary of good data */ 1277 unsigned long good_bytes; /* boundary of good data */
1283 unsigned long flags; 1278 unsigned long flags;
1284};
1285 1279
1286struct nfs_pgio_data { 1280 /*
1287 struct nfs_pgio_header *header; 1281 * rpc data
1282 */
1288 struct rpc_task task; 1283 struct rpc_task task;
1289 struct nfs_fattr fattr; 1284 struct nfs_fattr fattr;
1290 struct nfs_writeverf verf; /* Used for writes */
1291 struct nfs_pgio_args args; /* argument struct */ 1285 struct nfs_pgio_args args; /* argument struct */
1292 struct nfs_pgio_res res; /* result struct */ 1286 struct nfs_pgio_res res; /* result struct */
1293 unsigned long timestamp; /* For lease renewal */ 1287 unsigned long timestamp; /* For lease renewal */
1294 int (*pgio_done_cb) (struct rpc_task *task, struct nfs_pgio_data *data); 1288 int (*pgio_done_cb)(struct rpc_task *, struct nfs_pgio_header *);
1295 __u64 mds_offset; /* Filelayout dense stripe */ 1289 __u64 mds_offset; /* Filelayout dense stripe */
1296 struct nfs_page_array pages; 1290 struct nfs_page_array page_array;
1297 struct nfs_client *ds_clp; /* pNFS data server */ 1291 struct nfs_client *ds_clp; /* pNFS data server */
1298 int ds_idx; /* ds index if ds_clp is set */ 1292 int ds_idx; /* ds index if ds_clp is set */
1299}; 1293};
1300 1294
1301struct nfs_rw_header {
1302 struct nfs_pgio_header header;
1303 struct nfs_pgio_data rpc_data;
1304};
1305
1306struct nfs_mds_commit_info { 1295struct nfs_mds_commit_info {
1307 atomic_t rpcs_out; 1296 atomic_t rpcs_out;
1308 unsigned long ncommit; 1297 unsigned long ncommit;
@@ -1432,11 +1421,12 @@ struct nfs_rpc_ops {
1432 struct nfs_pathconf *); 1421 struct nfs_pathconf *);
1433 int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); 1422 int (*set_capabilities)(struct nfs_server *, struct nfs_fh *);
1434 int (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int); 1423 int (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int);
1435 int (*pgio_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); 1424 int (*pgio_rpc_prepare)(struct rpc_task *,
1436 void (*read_setup) (struct nfs_pgio_data *, struct rpc_message *); 1425 struct nfs_pgio_header *);
1437 int (*read_done) (struct rpc_task *, struct nfs_pgio_data *); 1426 void (*read_setup)(struct nfs_pgio_header *, struct rpc_message *);
1438 void (*write_setup) (struct nfs_pgio_data *, struct rpc_message *); 1427 int (*read_done)(struct rpc_task *, struct nfs_pgio_header *);
1439 int (*write_done) (struct rpc_task *, struct nfs_pgio_data *); 1428 void (*write_setup)(struct nfs_pgio_header *, struct rpc_message *);
1429 int (*write_done)(struct rpc_task *, struct nfs_pgio_header *);
1440 void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *); 1430 void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
1441 void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *); 1431 void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
1442 int (*commit_done) (struct rpc_task *, struct nfs_commit_data *); 1432 int (*commit_done) (struct rpc_task *, struct nfs_commit_data *);
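
The nfs_xdr.h hunk is the heart of the series: struct nfs_pgio_data and struct nfs_rw_header collapse into struct nfs_pgio_header, so a single allocation carries both the completion state and the RPC task, and the data->header back-pointer plus its refcount disappear. The resulting access-pattern change, shown for contrast (the "before" half uses a type that no longer exists and is illustrative only):

	/* before: two objects linked by a back-pointer */
	static struct inode *demo_inode_old(struct nfs_pgio_data *data)
	{
		return data->header->inode;
	}

	/* after: one object, one hop */
	static struct inode *demo_inode_new(struct nfs_pgio_header *hdr)
	{
		return hdr->inode;
	}
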
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 790be1472792..8e030075fe79 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -103,6 +103,7 @@ struct rpc_auth_create_args {
103 103
104/* Flags for rpcauth_lookupcred() */ 104/* Flags for rpcauth_lookupcred() */
105#define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */ 105#define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */
106#define RPCAUTH_LOOKUP_RCU 0x02 /* lock-less lookup */
106 107
107/* 108/*
108 * Client authentication ops 109 * Client authentication ops
@@ -140,6 +141,7 @@ struct rpc_credops {
140 void *, __be32 *, void *); 141 void *, __be32 *, void *);
141 int (*crkey_timeout)(struct rpc_cred *); 142 int (*crkey_timeout)(struct rpc_cred *);
142 bool (*crkey_to_expire)(struct rpc_cred *); 143 bool (*crkey_to_expire)(struct rpc_cred *);
144 char * (*crstringify_acceptor)(struct rpc_cred *);
143}; 145};
144 146
145extern const struct rpc_authops authunix_ops; 147extern const struct rpc_authops authunix_ops;
@@ -153,6 +155,7 @@ void rpc_destroy_generic_auth(void);
153void rpc_destroy_authunix(void); 155void rpc_destroy_authunix(void);
154 156
155struct rpc_cred * rpc_lookup_cred(void); 157struct rpc_cred * rpc_lookup_cred(void);
158struct rpc_cred * rpc_lookup_cred_nonblock(void);
156struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); 159struct rpc_cred * rpc_lookup_machine_cred(const char *service_name);
157int rpcauth_register(const struct rpc_authops *); 160int rpcauth_register(const struct rpc_authops *);
158int rpcauth_unregister(const struct rpc_authops *); 161int rpcauth_unregister(const struct rpc_authops *);
@@ -182,6 +185,7 @@ void rpcauth_clear_credcache(struct rpc_cred_cache *);
182int rpcauth_key_timeout_notify(struct rpc_auth *, 185int rpcauth_key_timeout_notify(struct rpc_auth *,
183 struct rpc_cred *); 186 struct rpc_cred *);
184bool rpcauth_cred_key_to_expire(struct rpc_cred *); 187bool rpcauth_cred_key_to_expire(struct rpc_cred *);
188char * rpcauth_stringify_acceptor(struct rpc_cred *);
185 189
186static inline 190static inline
187struct rpc_cred * get_rpccred(struct rpc_cred *cred) 191struct rpc_cred * get_rpccred(struct rpc_cred *cred)
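
rpcauth_stringify_acceptor() introduced above is a thin dispatcher: it returns NULL unless the flavor implements crstringify_acceptor (only AUTH_GSS grows one in this series, exposing the GSSAPI acceptor name parsed into gc_acceptor). A hedged caller sketch; that the returned string is kmalloc'd and owned by the caller is an assumption consistent with its use here:

	#include <linux/slab.h>
	#include <linux/sunrpc/auth.h>

	static void demo_log_acceptor(struct rpc_cred *cred)
	{
		char *acceptor = rpcauth_stringify_acceptor(cred);

		if (!acceptor)
			return;		/* non-GSS flavors have no acceptor */
		pr_info("GSS acceptor: %s\n", acceptor);
		kfree(acceptor);	/* assumed: caller frees the copy */
	}
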
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index f1cfd4c85cd0..36eebc451b41 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,8 +69,9 @@ struct gss_cl_ctx {
69 enum rpc_gss_proc gc_proc; 69 enum rpc_gss_proc gc_proc;
70 u32 gc_seq; 70 u32 gc_seq;
71 spinlock_t gc_seq_lock; 71 spinlock_t gc_seq_lock;
72 struct gss_ctx __rcu *gc_gss_ctx; 72 struct gss_ctx *gc_gss_ctx;
73 struct xdr_netobj gc_wire_ctx; 73 struct xdr_netobj gc_wire_ctx;
74 struct xdr_netobj gc_acceptor;
74 u32 gc_win; 75 u32 gc_win;
75 unsigned long gc_expiry; 76 unsigned long gc_expiry;
76 struct rcu_head gc_rcu; 77 struct rcu_head gc_rcu;
diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h
index 5af2931cf58d..df02a4188487 100644
--- a/include/linux/sunrpc/gss_krb5.h
+++ b/include/linux/sunrpc/gss_krb5.h
@@ -81,7 +81,7 @@ struct gss_krb5_enctype {
81 struct xdr_netobj *in, 81 struct xdr_netobj *in,
82 struct xdr_netobj *out); /* complete key generation */ 82 struct xdr_netobj *out); /* complete key generation */
83 u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset, 83 u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset,
84 struct xdr_buf *buf, int ec, 84 struct xdr_buf *buf,
85 struct page **pages); /* v2 encryption function */ 85 struct page **pages); /* v2 encryption function */
86 u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, 86 u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset,
87 struct xdr_buf *buf, u32 *headskip, 87 struct xdr_buf *buf, u32 *headskip,
@@ -310,7 +310,7 @@ gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
310 310
311u32 311u32
312gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, 312gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
313 struct xdr_buf *buf, int ec, 313 struct xdr_buf *buf,
314 struct page **pages); 314 struct page **pages);
315 315
316u32 316u32
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index c2f04e1ae159..64a0a0a97b23 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -62,8 +62,6 @@
62#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ 62#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */
63 63
64/* memory registration strategies */ 64/* memory registration strategies */
65#define RPCRDMA_PERSISTENT_REGISTRATION (1)
66
67enum rpcrdma_memreg { 65enum rpcrdma_memreg {
68 RPCRDMA_BOUNCEBUFFERS = 0, 66 RPCRDMA_BOUNCEBUFFERS = 0,
69 RPCRDMA_REGISTER, 67 RPCRDMA_REGISTER,
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index a622ad64acd8..2e0a6f92e563 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
176 len = (buf + buflen) - delim - 1; 176 len = (buf + buflen) - delim - 1;
177 p = kstrndup(delim + 1, len, GFP_KERNEL); 177 p = kstrndup(delim + 1, len, GFP_KERNEL);
178 if (p) { 178 if (p) {
179 unsigned long scope_id = 0; 179 u32 scope_id = 0;
180 struct net_device *dev; 180 struct net_device *dev;
181 181
182 dev = dev_get_by_name(net, p); 182 dev = dev_get_by_name(net, p);
@@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
184 scope_id = dev->ifindex; 184 scope_id = dev->ifindex;
185 dev_put(dev); 185 dev_put(dev);
186 } else { 186 } else {
187 if (strict_strtoul(p, 10, &scope_id) == 0) { 187 if (kstrtou32(p, 10, &scope_id) == 0) {
188 kfree(p); 188 kfree(p);
189 return 0; 189 return 0;
190 } 190 }
@@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
304 * @sap: buffer into which to plant socket address 304 * @sap: buffer into which to plant socket address
305 * @salen: size of buffer 305 * @salen: size of buffer
306 * 306 *
307 * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and 307 * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
308 * rpc_pton() require proper string termination to be successful. 308 * rpc_pton() require proper string termination to be successful.
309 * 309 *
310 * Returns the size of the socket address if successful; otherwise 310 * Returns the size of the socket address if successful; otherwise
@@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
315 const size_t salen) 315 const size_t salen)
316{ 316{
317 char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')]; 317 char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
318 unsigned long portlo, porthi; 318 u8 portlo, porthi;
319 unsigned short port; 319 unsigned short port;
320 320
321 if (uaddr_len > RPCBIND_MAXUADDRLEN) 321 if (uaddr_len > RPCBIND_MAXUADDRLEN)
@@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
327 c = strrchr(buf, '.'); 327 c = strrchr(buf, '.');
328 if (unlikely(c == NULL)) 328 if (unlikely(c == NULL))
329 return 0; 329 return 0;
330 if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0)) 330 if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
331 return 0;
332 if (unlikely(portlo > 255))
333 return 0; 331 return 0;
334 332
335 *c = '\0'; 333 *c = '\0';
336 c = strrchr(buf, '.'); 334 c = strrchr(buf, '.');
337 if (unlikely(c == NULL)) 335 if (unlikely(c == NULL))
338 return 0; 336 return 0;
339 if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0)) 337 if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
340 return 0;
341 if (unlikely(porthi > 255))
342 return 0; 338 return 0;
343 339
344 port = (unsigned short)((porthi << 8) | portlo); 340 port = (unsigned short)((porthi << 8) | portlo);
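
The addr.c conversion replaces strict_strtoul() plus manual "> 255" checks with kstrtou8(), which enforces the range itself and fails on anything that does not fit in a byte. The last two dot-separated fields of an RPC universal address are the port's high and low bytes; a userspace illustration of the final assembly:

	#include <stdio.h>

	int main(void)
	{
		unsigned int porthi = 8, portlo = 1;	/* uaddr tail "...8.1" */
		unsigned short port = (unsigned short)((porthi << 8) | portlo);

		printf("port = %u\n", port);	/* 2049, the NFS port */
		return 0;
	}
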
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index f77366717420..383eb919ac0b 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
48 48
49 if (!val) 49 if (!val)
50 goto out_inval; 50 goto out_inval;
51 ret = strict_strtoul(val, 0, &num); 51 ret = kstrtoul(val, 0, &num);
52 if (ret == -EINVAL) 52 if (ret == -EINVAL)
53 goto out_inval; 53 goto out_inval;
54 nbits = fls(num); 54 nbits = fls(num);
@@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = {
80module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); 80module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
81MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); 81MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
82 82
83static unsigned long auth_max_cred_cachesize = ULONG_MAX;
84module_param(auth_max_cred_cachesize, ulong, 0644);
85MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
86
83static u32 87static u32
84pseudoflavor_to_flavor(u32 flavor) { 88pseudoflavor_to_flavor(u32 flavor) {
85 if (flavor > RPC_AUTH_MAXFLAVOR) 89 if (flavor > RPC_AUTH_MAXFLAVOR)
@@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred)
363} 367}
364EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire); 368EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
365 369
370char *
371rpcauth_stringify_acceptor(struct rpc_cred *cred)
372{
373 if (!cred->cr_ops->crstringify_acceptor)
374 return NULL;
375 return cred->cr_ops->crstringify_acceptor(cred);
376}
377EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
378
366/* 379/*
367 * Destroy a list of credentials 380 * Destroy a list of credentials
368 */ 381 */
@@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
472 return freed; 485 return freed;
473} 486}
474 487
488static unsigned long
489rpcauth_cache_do_shrink(int nr_to_scan)
490{
491 LIST_HEAD(free);
492 unsigned long freed;
493
494 spin_lock(&rpc_credcache_lock);
495 freed = rpcauth_prune_expired(&free, nr_to_scan);
496 spin_unlock(&rpc_credcache_lock);
497 rpcauth_destroy_credlist(&free);
498
499 return freed;
500}
501
475/* 502/*
476 * Run memory cache shrinker. 503 * Run memory cache shrinker.
477 */ 504 */
@@ -479,9 +506,6 @@ static unsigned long
479rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 506rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
480 507
481{ 508{
482 LIST_HEAD(free);
483 unsigned long freed;
484
485 if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) 509 if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
486 return SHRINK_STOP; 510 return SHRINK_STOP;
487 511
@@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
489 if (list_empty(&cred_unused)) 513 if (list_empty(&cred_unused))
490 return SHRINK_STOP; 514 return SHRINK_STOP;
491 515
492 spin_lock(&rpc_credcache_lock); 516 return rpcauth_cache_do_shrink(sc->nr_to_scan);
493 freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
494 spin_unlock(&rpc_credcache_lock);
495 rpcauth_destroy_credlist(&free);
496
497 return freed;
498} 517}
499 518
500static unsigned long 519static unsigned long
@@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
504 return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; 523 return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
505} 524}
506 525
526static void
527rpcauth_cache_enforce_limit(void)
528{
529 unsigned long diff;
530 unsigned int nr_to_scan;
531
532 if (number_cred_unused <= auth_max_cred_cachesize)
533 return;
534 diff = number_cred_unused - auth_max_cred_cachesize;
535 nr_to_scan = 100;
536 if (diff < nr_to_scan)
537 nr_to_scan = diff;
538 rpcauth_cache_do_shrink(nr_to_scan);
539}
540
507/* 541/*
508 * Look up a process' credentials in the authentication cache 542 * Look up a process' credentials in the authentication cache
509 */ 543 */
@@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
523 hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { 557 hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
524 if (!entry->cr_ops->crmatch(acred, entry, flags)) 558 if (!entry->cr_ops->crmatch(acred, entry, flags))
525 continue; 559 continue;
560 if (flags & RPCAUTH_LOOKUP_RCU) {
561 if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
562 !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
563 cred = entry;
564 break;
565 }
526 spin_lock(&cache->lock); 566 spin_lock(&cache->lock);
527 if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { 567 if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
528 spin_unlock(&cache->lock); 568 spin_unlock(&cache->lock);
@@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
537 if (cred != NULL) 577 if (cred != NULL)
538 goto found; 578 goto found;
539 579
580 if (flags & RPCAUTH_LOOKUP_RCU)
581 return ERR_PTR(-ECHILD);
582
540 new = auth->au_ops->crcreate(auth, acred, flags); 583 new = auth->au_ops->crcreate(auth, acred, flags);
541 if (IS_ERR(new)) { 584 if (IS_ERR(new)) {
542 cred = new; 585 cred = new;
@@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
557 } else 600 } else
558 list_add_tail(&new->cr_lru, &free); 601 list_add_tail(&new->cr_lru, &free);
559 spin_unlock(&cache->lock); 602 spin_unlock(&cache->lock);
603 rpcauth_cache_enforce_limit();
560found: 604found:
561 if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && 605 if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
562 cred->cr_ops->cr_init != NULL && 606 cred->cr_ops->cr_init != NULL &&
@@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
586 memset(&acred, 0, sizeof(acred)); 630 memset(&acred, 0, sizeof(acred));
587 acred.uid = cred->fsuid; 631 acred.uid = cred->fsuid;
588 acred.gid = cred->fsgid; 632 acred.gid = cred->fsgid;
589 acred.group_info = get_group_info(((struct cred *)cred)->group_info); 633 acred.group_info = cred->group_info;
590
591 ret = auth->au_ops->lookup_cred(auth, &acred, flags); 634 ret = auth->au_ops->lookup_cred(auth, &acred, flags);
592 put_group_info(acred.group_info);
593 return ret; 635 return ret;
594} 636}
595EXPORT_SYMBOL_GPL(rpcauth_lookupcred); 637EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
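
rpcauth_cache_enforce_limit() above bounds the work done in the lookup path: once number_cred_unused exceeds the new auth_max_cred_cachesize module parameter (default ULONG_MAX, i.e. unlimited), each lookup shrinks the cache by at most 100 credentials rather than purging the whole overshoot at once. A toy model of the batching:

	static unsigned int demo_creds_to_scan(unsigned long unused,
					       unsigned long max)
	{
		unsigned long diff;

		if (unused <= max)
			return 0;		/* under the ceiling: no work */
		diff = unused - max;
		return diff < 100 ? diff : 100;	/* cap the per-call batch */
	}
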
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index ed04869b2d4f..6f6b829c9e8e 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
38} 38}
39EXPORT_SYMBOL_GPL(rpc_lookup_cred); 39EXPORT_SYMBOL_GPL(rpc_lookup_cred);
40 40
41struct rpc_cred *rpc_lookup_cred_nonblock(void)
42{
43 return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
44}
45EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
46
41/* 47/*
42 * Public call interface for looking up machine creds. 48 * Public call interface for looking up machine creds.
43 */ 49 */
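
rpc_lookup_cred_nonblock() gives callers that cannot sleep (for example, code running under RCU-walk) a way to probe the credential cache; it fails with -ECHILD instead of blocking. A plausible caller pattern (this helper is hypothetical, not part of the series) falls back to the sleeping variant; the caller owns a reference to the returned cred either way:

	#include <linux/err.h>
	#include <linux/types.h>
	#include <linux/sunrpc/auth.h>

	static struct rpc_cred *lookup_cred_flexible(bool may_block)
	{
		struct rpc_cred *cred;

		cred = rpc_lookup_cred_nonblock();	/* never sleeps */
		if (!IS_ERR(cred) || PTR_ERR(cred) != -ECHILD || !may_block)
			return cred;
		return rpc_lookup_cred();		/* may sleep to create one */
	}
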
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index b6e440baccc3..afb292cd797d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred)
183 struct gss_cl_ctx *ctx = NULL; 183 struct gss_cl_ctx *ctx = NULL;
184 184
185 rcu_read_lock(); 185 rcu_read_lock();
186 if (gss_cred->gc_ctx) 186 ctx = rcu_dereference(gss_cred->gc_ctx);
187 ctx = gss_get_ctx(gss_cred->gc_ctx); 187 if (ctx)
188 gss_get_ctx(ctx);
188 rcu_read_unlock(); 189 rcu_read_unlock();
189 return ctx; 190 return ctx;
190} 191}
@@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
262 p = ERR_PTR(ret); 263 p = ERR_PTR(ret);
263 goto err; 264 goto err;
264 } 265 }
265 dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u\n", 266
266 __func__, ctx->gc_expiry, now, timeout); 267 /* is there any trailing data? */
267 return q; 268 if (q == end) {
269 p = q;
270 goto done;
271 }
272
273 /* pull in acceptor name (if there is one) */
274 p = simple_get_netobj(q, end, &ctx->gc_acceptor);
275 if (IS_ERR(p))
276 goto err;
277done:
278 dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
279 __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
280 ctx->gc_acceptor.data);
281 return p;
268err: 282err:
269 dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p)); 283 dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p));
270 return p; 284 return p;
@@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred)
1194{ 1208{
1195 struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); 1209 struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
1196 struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); 1210 struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
1211 struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
1197 struct rpc_task *task; 1212 struct rpc_task *task;
1198 1213
1199 if (gss_cred->gc_ctx == NULL || 1214 if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
1200 test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
1201 return 0; 1215 return 0;
1202 1216
1203 gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY; 1217 ctx->gc_proc = RPC_GSS_PROC_DESTROY;
1204 cred->cr_ops = &gss_nullops; 1218 cred->cr_ops = &gss_nullops;
1205 1219
1206 /* Take a reference to ensure the cred will be destroyed either 1220 /* Take a reference to ensure the cred will be destroyed either
@@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
1225 1239
1226 gss_delete_sec_context(&ctx->gc_gss_ctx); 1240 gss_delete_sec_context(&ctx->gc_gss_ctx);
1227 kfree(ctx->gc_wire_ctx.data); 1241 kfree(ctx->gc_wire_ctx.data);
1242 kfree(ctx->gc_acceptor.data);
1228 kfree(ctx); 1243 kfree(ctx);
1229} 1244}
1230 1245
@@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
1260{ 1275{
1261 struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); 1276 struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
1262 struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); 1277 struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
1263 struct gss_cl_ctx *ctx = gss_cred->gc_ctx; 1278 struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
1264 1279
1265 RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); 1280 RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
1266 call_rcu(&cred->cr_rcu, gss_free_cred_callback); 1281 call_rcu(&cred->cr_rcu, gss_free_cred_callback);
@@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
1332 return err; 1347 return err;
1333} 1348}
1334 1349
1350static char *
1351gss_stringify_acceptor(struct rpc_cred *cred)
1352{
1353 char *string = NULL;
1354 struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
1355 struct gss_cl_ctx *ctx;
1356 struct xdr_netobj *acceptor;
1357
1358 rcu_read_lock();
1359 ctx = rcu_dereference(gss_cred->gc_ctx);
1360 if (!ctx)
1361 goto out;
1362
1363 acceptor = &ctx->gc_acceptor;
1364
1365 /* no point if there's no string */
1366 if (!acceptor->len)
1367 goto out;
1368
1369 string = kmalloc(acceptor->len + 1, GFP_KERNEL);
1370 if (!string)
1371 goto out;
1372
1373 memcpy(string, acceptor->data, acceptor->len);
1374 string[acceptor->len] = '\0';
1375out:
1376 rcu_read_unlock();
1377 return string;
1378}
1379
1335/* 1380/*
1336 * Returns -EACCES if GSS context is NULL or will expire within the 1381 * Returns -EACCES if GSS context is NULL or will expire within the
1337 * timeout (milliseconds) 1382 * timeout (milliseconds)
@@ -1340,15 +1385,16 @@ static int
1340gss_key_timeout(struct rpc_cred *rc) 1385gss_key_timeout(struct rpc_cred *rc)
1341{ 1386{
1342 struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); 1387 struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
1388 struct gss_cl_ctx *ctx;
1343 unsigned long now = jiffies; 1389 unsigned long now = jiffies;
1344 unsigned long expire; 1390 unsigned long expire;
1345 1391
1346 if (gss_cred->gc_ctx == NULL) 1392 rcu_read_lock();
1347 return -EACCES; 1393 ctx = rcu_dereference(gss_cred->gc_ctx);
1348 1394 if (ctx)
1349 expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ); 1395 expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
1350 1396 rcu_read_unlock();
1351 if (time_after(now, expire)) 1397 if (!ctx || time_after(now, expire))
1352 return -EACCES; 1398 return -EACCES;
1353 return 0; 1399 return 0;
1354} 1400}
@@ -1357,13 +1403,19 @@ static int
1357gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) 1403gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
1358{ 1404{
1359 struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); 1405 struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
1406 struct gss_cl_ctx *ctx;
1360 int ret; 1407 int ret;
1361 1408
1362 if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) 1409 if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
1363 goto out; 1410 goto out;
1364 /* Don't match with creds that have expired. */ 1411 /* Don't match with creds that have expired. */
1365 if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) 1412 rcu_read_lock();
1413 ctx = rcu_dereference(gss_cred->gc_ctx);
1414 if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
1415 rcu_read_unlock();
1366 return 0; 1416 return 0;
1417 }
1418 rcu_read_unlock();
1367 if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) 1419 if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
1368 return 0; 1420 return 0;
1369out: 1421out:
@@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = {
1909}; 1961};
1910 1962
1911static const struct rpc_credops gss_credops = { 1963static const struct rpc_credops gss_credops = {
1912 .cr_name = "AUTH_GSS", 1964 .cr_name = "AUTH_GSS",
1913 .crdestroy = gss_destroy_cred, 1965 .crdestroy = gss_destroy_cred,
1914 .cr_init = gss_cred_init, 1966 .cr_init = gss_cred_init,
1915 .crbind = rpcauth_generic_bind_cred, 1967 .crbind = rpcauth_generic_bind_cred,
1916 .crmatch = gss_match, 1968 .crmatch = gss_match,
1917 .crmarshal = gss_marshal, 1969 .crmarshal = gss_marshal,
1918 .crrefresh = gss_refresh, 1970 .crrefresh = gss_refresh,
1919 .crvalidate = gss_validate, 1971 .crvalidate = gss_validate,
1920 .crwrap_req = gss_wrap_req, 1972 .crwrap_req = gss_wrap_req,
1921 .crunwrap_resp = gss_unwrap_resp, 1973 .crunwrap_resp = gss_unwrap_resp,
1922 .crkey_timeout = gss_key_timeout, 1974 .crkey_timeout = gss_key_timeout,
1975 .crstringify_acceptor = gss_stringify_acceptor,
1923}; 1976};
1924 1977
1925static const struct rpc_credops gss_nullops = { 1978static const struct rpc_credops gss_nullops = {
1926 .cr_name = "AUTH_GSS", 1979 .cr_name = "AUTH_GSS",
1927 .crdestroy = gss_destroy_nullcred, 1980 .crdestroy = gss_destroy_nullcred,
1928 .crbind = rpcauth_generic_bind_cred, 1981 .crbind = rpcauth_generic_bind_cred,
1929 .crmatch = gss_match, 1982 .crmatch = gss_match,
1930 .crmarshal = gss_marshal, 1983 .crmarshal = gss_marshal,
1931 .crrefresh = gss_refresh_null, 1984 .crrefresh = gss_refresh_null,
1932 .crvalidate = gss_validate, 1985 .crvalidate = gss_validate,
1933 .crwrap_req = gss_wrap_req, 1986 .crwrap_req = gss_wrap_req,
1934 .crunwrap_resp = gss_unwrap_resp, 1987 .crunwrap_resp = gss_unwrap_resp,
1988 .crstringify_acceptor = gss_stringify_acceptor,
1935}; 1989};
1936 1990
1937static const struct rpc_pipe_ops gss_upcall_ops_v0 = { 1991static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
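
Two themes run through the auth_gss.c hunks: every read of gc_ctx now goes through rcu_dereference() (or rcu_dereference_protected() where the cred is known to be unhashed), and gss_fill_context() now parses an optional trailing acceptor name from the downcall, storing it as a counted xdr_netobj that gss_do_free_ctx() must also free. gss_stringify_acceptor() has to copy that netobj because it carries no NUL terminator; a user-space model of the copy step, with the struct name borrowed from the kernel:

	#include <stdlib.h>
	#include <string.h>

	struct xdr_netobj {		/* counted byte string, no terminator */
		unsigned int len;
		unsigned char *data;
	};

	static char *stringify_netobj(const struct xdr_netobj *obj)
	{
		char *s;

		if (!obj->len)
			return NULL;		/* nothing to stringify */
		s = malloc(obj->len + 1);	/* room for the added NUL */
		if (!s)
			return NULL;
		memcpy(s, obj->data, obj->len);
		s[obj->len] = '\0';
		return s;
	}
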
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 0f43e894bc0a..f5ed9f6ece06 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -641,7 +641,7 @@ out:
641 641
642u32 642u32
643gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, 643gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
644 struct xdr_buf *buf, int ec, struct page **pages) 644 struct xdr_buf *buf, struct page **pages)
645{ 645{
646 u32 err; 646 u32 err;
647 struct xdr_netobj hmac; 647 struct xdr_netobj hmac;
@@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
684 ecptr = buf->tail[0].iov_base; 684 ecptr = buf->tail[0].iov_base;
685 } 685 }
686 686
687 memset(ecptr, 'X', ec);
688 buf->tail[0].iov_len += ec;
689 buf->len += ec;
690
691 /* copy plaintext gss token header after filler (if any) */ 687 /* copy plaintext gss token header after filler (if any) */
692 memcpy(ecptr + ec, buf->head[0].iov_base + offset, 688 memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
693 GSS_KRB5_TOK_HDR_LEN);
694 buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; 689 buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
695 buf->len += GSS_KRB5_TOK_HDR_LEN; 690 buf->len += GSS_KRB5_TOK_HDR_LEN;
696 691
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 62ae3273186c..42768e5c3994 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,31 +70,37 @@
70 70
71DEFINE_SPINLOCK(krb5_seq_lock); 71DEFINE_SPINLOCK(krb5_seq_lock);
72 72
73static char * 73static void *
74setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) 74setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
75{ 75{
76 __be16 *ptr, *krb5_hdr; 76 u16 *ptr;
77 void *krb5_hdr;
77 int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; 78 int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
78 79
79 token->len = g_token_size(&ctx->mech_used, body_size); 80 token->len = g_token_size(&ctx->mech_used, body_size);
80 81
81 ptr = (__be16 *)token->data; 82 ptr = (u16 *)token->data;
82 g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); 83 g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
83 84
84 /* ptr now at start of header described in rfc 1964, section 1.2.1: */ 85 /* ptr now at start of header described in rfc 1964, section 1.2.1: */
85 krb5_hdr = ptr; 86 krb5_hdr = ptr;
86 *ptr++ = KG_TOK_MIC_MSG; 87 *ptr++ = KG_TOK_MIC_MSG;
87 *ptr++ = cpu_to_le16(ctx->gk5e->signalg); 88 /*
89 * signalg is stored as if it were converted from LE to host endian, even
90 * though it's an opaque pair of bytes according to the RFC.
91 */
92 *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
88 *ptr++ = SEAL_ALG_NONE; 93 *ptr++ = SEAL_ALG_NONE;
89 *ptr++ = 0xffff; 94 *ptr = 0xffff;
90 95
91 return (char *)krb5_hdr; 96 return krb5_hdr;
92} 97}
93 98
94static void * 99static void *
95setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) 100setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
96{ 101{
97 __be16 *ptr, *krb5_hdr; 102 u16 *ptr;
103 void *krb5_hdr;
98 u8 *p, flags = 0x00; 104 u8 *p, flags = 0x00;
99 105
100 if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) 106 if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
@@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
104 110
105 /* Per rfc 4121, sec 4.2.6.1, there is no header, 111 /* Per rfc 4121, sec 4.2.6.1, there is no header,
106 * just start the token */ 112 * just start the token */
107 krb5_hdr = ptr = (__be16 *)token->data; 113 krb5_hdr = ptr = (u16 *)token->data;
108 114
109 *ptr++ = KG2_TOK_MIC; 115 *ptr++ = KG2_TOK_MIC;
110 p = (u8 *)ptr; 116 p = (u8 *)ptr;
111 *p++ = flags; 117 *p++ = flags;
112 *p++ = 0xff; 118 *p++ = 0xff;
113 ptr = (__be16 *)p; 119 ptr = (u16 *)p;
114 *ptr++ = 0xffff;
115 *ptr++ = 0xffff; 120 *ptr++ = 0xffff;
121 *ptr = 0xffff;
116 122
117 token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; 123 token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
118 return krb5_hdr; 124 return krb5_hdr;
@@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
181 spin_lock(&krb5_seq_lock); 187 spin_lock(&krb5_seq_lock);
182 seq_send = ctx->seq_send64++; 188 seq_send = ctx->seq_send64++;
183 spin_unlock(&krb5_seq_lock); 189 spin_unlock(&krb5_seq_lock);
184 *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); 190 *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
185 191
186 if (ctx->initiate) { 192 if (ctx->initiate) {
187 cksumkey = ctx->initiator_sign; 193 cksumkey = ctx->initiator_sign;
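
The gss_krb5_seal.c hunks are endianness-annotation fixes rather than behavioral changes: the token header fields are opaque byte pairs on the wire, so the cursors become plain u16 * and the single value produced by cpu_to_le16() is stored through a __force cast to keep sparse quiet. Outside the kernel, the same wire layout is just two explicit byte stores, roughly:

	#include <stdint.h>

	/* write a 16-bit value in little-endian byte order, as RFC 1964
	 * lays out SGN_ALG; illustrative only */
	static void put_le16(uint8_t *p, uint16_t v)
	{
		p[0] = (uint8_t)(v & 0xff);	/* low byte first */
		p[1] = (uint8_t)(v >> 8);
	}
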
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 42560e55d978..4b614c604fe0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
201 201
202 msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; 202 msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
203 203
204 *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); 204 /*
205 memset(ptr + 4, 0xff, 4); 205 * signalg and sealalg are stored as if they were converted from LE
206 *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); 206 * to host endian, even though they're opaque pairs of bytes according
207 * to the RFC.
208 */
209 *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
210 *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
211 ptr[6] = 0xff;
212 ptr[7] = 0xff;
207 213
208 gss_krb5_make_confounder(msg_start, conflen); 214 gss_krb5_make_confounder(msg_start, conflen);
209 215
@@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
438 u8 *ptr, *plainhdr; 444 u8 *ptr, *plainhdr;
439 s32 now; 445 s32 now;
440 u8 flags = 0x00; 446 u8 flags = 0x00;
441 __be16 *be16ptr, ec = 0; 447 __be16 *be16ptr;
442 __be64 *be64ptr; 448 __be64 *be64ptr;
443 u32 err; 449 u32 err;
444 450
@@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
468 be16ptr = (__be16 *)ptr; 474 be16ptr = (__be16 *)ptr;
469 475
470 blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); 476 blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
471 *be16ptr++ = cpu_to_be16(ec); 477 *be16ptr++ = 0;
472 /* "inner" token header always uses 0 for RRC */ 478 /* "inner" token header always uses 0 for RRC */
473 *be16ptr++ = cpu_to_be16(0); 479 *be16ptr++ = 0;
474 480
475 be64ptr = (__be64 *)be16ptr; 481 be64ptr = (__be64 *)be16ptr;
476 spin_lock(&krb5_seq_lock); 482 spin_lock(&krb5_seq_lock);
477 *be64ptr = cpu_to_be64(kctx->seq_send64++); 483 *be64ptr = cpu_to_be64(kctx->seq_send64++);
478 spin_unlock(&krb5_seq_lock); 484 spin_unlock(&krb5_seq_lock);
479 485
480 err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); 486 err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
481 if (err) 487 if (err)
482 return err; 488 return err;
483 489
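
In gss_krb5_wrap.c the "extra count" (ec) is now always zero, so the v2 token header stores literal zeroes instead of cpu_to_be16(ec), the v1 path writes its 0xff filler bytes individually, and encrypt_v2 loses its ec parameter (matching the gss_krb5_crypto.c hunk above). For orientation, the RFC 4121 wrap-token header being filled in looks roughly like this packed struct; it is illustrative only, as the kernel writes the fields through incrementing pointers:

	#include <stdint.h>

	struct krb5_v2_wrap_hdr {
		uint8_t  tok_id[2];	/* KG2_TOK_WRAP */
		uint8_t  flags;
		uint8_t  filler;	/* always 0xff */
		uint16_t ec;		/* extra count, big-endian; now always 0 */
		uint16_t rrc;		/* right rotation count; 0 in inner header */
		uint64_t seq_send;	/* big-endian send sequence number */
	} __attribute__((packed));
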
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index f0ebe07978a2..712c123e04e9 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
35static struct rpc_cred * 35static struct rpc_cred *
36nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 36nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
37{ 37{
38 if (flags & RPCAUTH_LOOKUP_RCU)
39 return &null_cred;
38 return get_rpccred(&null_cred); 40 return get_rpccred(&null_cred);
39} 41}
40 42
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2e6ab10734f6..488ddeed9363 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task)
1746 case -EHOSTDOWN: 1746 case -EHOSTDOWN:
1747 case -EHOSTUNREACH: 1747 case -EHOSTUNREACH:
1748 case -ENETUNREACH: 1748 case -ENETUNREACH:
1749 case -ENOBUFS:
1749 case -EPIPE: 1750 case -EPIPE:
1750 dprintk("RPC: %5u remote rpcbind unreachable: %d\n", 1751 dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
1751 task->tk_pid, task->tk_status); 1752 task->tk_pid, task->tk_status);
@@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task)
1812 case -ECONNABORTED: 1813 case -ECONNABORTED:
1813 case -ENETUNREACH: 1814 case -ENETUNREACH:
1814 case -EHOSTUNREACH: 1815 case -EHOSTUNREACH:
1816 case -ENOBUFS:
1817 case -EPIPE:
1815 if (RPC_IS_SOFTCONN(task)) 1818 if (RPC_IS_SOFTCONN(task))
1816 break; 1819 break;
1817 /* retry with existing socket, after a delay */ 1820 /* retry with existing socket, after a delay */
@@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task)
1918 case -ECONNRESET: 1921 case -ECONNRESET:
1919 case -ECONNABORTED: 1922 case -ECONNABORTED:
1920 case -ENOTCONN: 1923 case -ENOTCONN:
1924 case -ENOBUFS:
1921 case -EPIPE: 1925 case -EPIPE:
1922 rpc_task_force_reencode(task); 1926 rpc_task_force_reencode(task);
1923 } 1927 }
@@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task)
2034 case -ECONNRESET: 2038 case -ECONNRESET:
2035 case -ECONNABORTED: 2039 case -ECONNABORTED:
2036 rpc_force_rebind(clnt); 2040 rpc_force_rebind(clnt);
2041 case -ENOBUFS:
2037 rpc_delay(task, 3*HZ); 2042 rpc_delay(task, 3*HZ);
2038 case -EPIPE: 2043 case -EPIPE:
2039 case -ENOTCONN: 2044 case -ENOTCONN:
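
The clnt.c (and xprt.c) hunks fold -ENOBUFS, and in one case -EPIPE, into the sets of errors treated as transient: rebind, reconnect, re-encode, or delay and retry rather than fail the RPC. The classification they express could be summarized by a helper like this (hypothetical, not in the patch):

	#include <errno.h>
	#include <stdbool.h>

	static bool rpc_errno_is_transient(int err)
	{
		switch (err) {
		case -EHOSTDOWN:
		case -EHOSTUNREACH:
		case -ENETUNREACH:
		case -ECONNRESET:
		case -ECONNABORTED:
		case -ENOTCONN:
		case -ENOBUFS:		/* newly added by this series */
		case -EPIPE:
			return true;
		default:
			return false;
		}
	}
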
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index b18554898562..2d12b76b5a64 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -195,7 +195,7 @@ static struct inode *
195rpc_alloc_inode(struct super_block *sb) 195rpc_alloc_inode(struct super_block *sb)
196{ 196{
197 struct rpc_inode *rpci; 197 struct rpc_inode *rpci;
198 rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); 198 rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
199 if (!rpci) 199 if (!rpci)
200 return NULL; 200 return NULL;
201 return &rpci->vfs_inode; 201 return &rpci->vfs_inode;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 51c63165073c..56e4e150e80e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task)
744 case -ECONNABORTED: 744 case -ECONNABORTED:
745 case -ENETUNREACH: 745 case -ENETUNREACH:
746 case -EHOSTUNREACH: 746 case -EHOSTUNREACH:
747 case -EPIPE:
747 case -EAGAIN: 748 case -EAGAIN:
748 dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); 749 dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
749 break; 750 break;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 693966d3f33b..6166c985fe24 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
53# define RPCDBG_FACILITY RPCDBG_TRANS 53# define RPCDBG_FACILITY RPCDBG_TRANS
54#endif 54#endif
55 55
56enum rpcrdma_chunktype {
57 rpcrdma_noch = 0,
58 rpcrdma_readch,
59 rpcrdma_areadch,
60 rpcrdma_writech,
61 rpcrdma_replych
62};
63
64#ifdef RPC_DEBUG 56#ifdef RPC_DEBUG
65static const char transfertypes[][12] = { 57static const char transfertypes[][12] = {
66 "pure inline", /* no chunks */ 58 "pure inline", /* no chunks */
@@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
279 return (unsigned char *)iptr - (unsigned char *)headerp; 271 return (unsigned char *)iptr - (unsigned char *)headerp;
280 272
281out: 273out:
282 for (pos = 0; nchunks--;) 274 if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
283 pos += rpcrdma_deregister_external( 275 for (pos = 0; nchunks--;)
284 &req->rl_segments[pos], r_xprt); 276 pos += rpcrdma_deregister_external(
277 &req->rl_segments[pos], r_xprt);
278 }
285 return n; 279 return n;
286} 280}
287 281
288/* 282/*
283 * Marshal chunks. This routine returns the header length
284 * consumed by marshaling.
285 *
286 * Returns positive RPC/RDMA header size, or negative errno.
287 */
288
289ssize_t
290rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
291{
292 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
293 struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
294
295 if (req->rl_rtype != rpcrdma_noch)
296 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
297 headerp, req->rl_rtype);
298 else if (req->rl_wtype != rpcrdma_noch)
299 result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
300 headerp, req->rl_wtype);
301 return result;
302}
303
304/*
289 * Copy write data inline. 305 * Copy write data inline.
290 * This function is used for "small" requests. Data which is passed 306 * This function is used for "small" requests. Data which is passed
291 * to RPC via iovecs (or page list) is copied directly into the 307 * to RPC via iovecs (or page list) is copied directly into the
@@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
377 char *base; 393 char *base;
378 size_t rpclen, padlen; 394 size_t rpclen, padlen;
379 ssize_t hdrlen; 395 ssize_t hdrlen;
380 enum rpcrdma_chunktype rtype, wtype;
381 struct rpcrdma_msg *headerp; 396 struct rpcrdma_msg *headerp;
382 397
383 /* 398 /*
@@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
415 * into pages; otherwise use reply chunks. 430 * into pages; otherwise use reply chunks.
416 */ 431 */
417 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) 432 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
418 wtype = rpcrdma_noch; 433 req->rl_wtype = rpcrdma_noch;
419 else if (rqst->rq_rcv_buf.page_len == 0) 434 else if (rqst->rq_rcv_buf.page_len == 0)
420 wtype = rpcrdma_replych; 435 req->rl_wtype = rpcrdma_replych;
421 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 436 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
422 wtype = rpcrdma_writech; 437 req->rl_wtype = rpcrdma_writech;
423 else 438 else
424 wtype = rpcrdma_replych; 439 req->rl_wtype = rpcrdma_replych;
425 440
426 /* 441 /*
427 * Chunks needed for arguments? 442 * Chunks needed for arguments?
@@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
438 * TBD check NFSv4 setacl 453 * TBD check NFSv4 setacl
439 */ 454 */
440 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) 455 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
441 rtype = rpcrdma_noch; 456 req->rl_rtype = rpcrdma_noch;
442 else if (rqst->rq_snd_buf.page_len == 0) 457 else if (rqst->rq_snd_buf.page_len == 0)
443 rtype = rpcrdma_areadch; 458 req->rl_rtype = rpcrdma_areadch;
444 else 459 else
445 rtype = rpcrdma_readch; 460 req->rl_rtype = rpcrdma_readch;
446 461
447 /* The following simplification is not true forever */ 462 /* The following simplification is not true forever */
448 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) 463 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
449 wtype = rpcrdma_noch; 464 req->rl_wtype = rpcrdma_noch;
450 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { 465 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
451 dprintk("RPC: %s: cannot marshal multiple chunk lists\n", 466 dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
452 __func__); 467 __func__);
453 return -EIO; 468 return -EIO;
@@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
461 * When padding is in use and applies to the transfer, insert 476 * When padding is in use and applies to the transfer, insert
462 * it and change the message type. 477 * it and change the message type.
463 */ 478 */
464 if (rtype == rpcrdma_noch) { 479 if (req->rl_rtype == rpcrdma_noch) {
465 480
466 padlen = rpcrdma_inline_pullup(rqst, 481 padlen = rpcrdma_inline_pullup(rqst,
467 RPCRDMA_INLINE_PAD_VALUE(rqst)); 482 RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
476 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 491 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
477 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 492 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
478 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 493 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
479 if (wtype != rpcrdma_noch) { 494 if (req->rl_wtype != rpcrdma_noch) {
480 dprintk("RPC: %s: invalid chunk list\n", 495 dprintk("RPC: %s: invalid chunk list\n",
481 __func__); 496 __func__);
482 return -EIO; 497 return -EIO;
@@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
497 * on receive. Therefore, we request a reply chunk 512 * on receive. Therefore, we request a reply chunk
498 * for non-writes wherever feasible and efficient. 513 * for non-writes wherever feasible and efficient.
499 */ 514 */
500 if (wtype == rpcrdma_noch) 515 if (req->rl_wtype == rpcrdma_noch)
501 wtype = rpcrdma_replych; 516 req->rl_wtype = rpcrdma_replych;
502 } 517 }
503 } 518 }
504 519
505 /* 520 hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
506 * Marshal chunks. This routine will return the header length
507 * consumed by marshaling.
508 */
509 if (rtype != rpcrdma_noch) {
510 hdrlen = rpcrdma_create_chunks(rqst,
511 &rqst->rq_snd_buf, headerp, rtype);
512 wtype = rtype; /* simplify dprintk */
513
514 } else if (wtype != rpcrdma_noch) {
515 hdrlen = rpcrdma_create_chunks(rqst,
516 &rqst->rq_rcv_buf, headerp, wtype);
517 }
518 if (hdrlen < 0) 521 if (hdrlen < 0)
519 return hdrlen; 522 return hdrlen;
520 523
521 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 524 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
522 " headerp 0x%p base 0x%p lkey 0x%x\n", 525 " headerp 0x%p base 0x%p lkey 0x%x\n",
523 __func__, transfertypes[wtype], hdrlen, rpclen, padlen, 526 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
524 headerp, base, req->rl_iov.lkey); 527 headerp, base, req->rl_iov.lkey);
525 528
526 /* 529 /*
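
The rpc_rdma.c refactoring caches the chosen chunk types on the request (rl_rtype/rl_wtype) and hoists chunk marshaling into the exported rpcrdma_marshal_chunks(), so the chunk list alone can be rebuilt later without redoing the whole header (used by the transport.c hunk below). The new dispatch condenses to:

	/* condensed restatement of rpcrdma_marshal_chunks() */
	ssize_t marshal_chunks(struct rpc_rqst *rqst, ssize_t hdrlen)
	{
		struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
		struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;

		if (req->rl_rtype != rpcrdma_noch)	/* read chunks: send buf */
			return rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
						     headerp, req->rl_rtype);
		if (req->rl_wtype != rpcrdma_noch)	/* write/reply: recv buf */
			return rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
						     headerp, req->rl_wtype);
		return hdrlen;				/* pure inline: unchanged */
	}
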
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 66f91f0d071a..2faac4940563 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args)
296 296
297 xprt->resvport = 0; /* privileged port not needed */ 297 xprt->resvport = 0; /* privileged port not needed */
298 xprt->tsh_size = 0; /* RPC-RDMA handles framing */ 298 xprt->tsh_size = 0; /* RPC-RDMA handles framing */
299 xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
300 xprt->ops = &xprt_rdma_procs; 299 xprt->ops = &xprt_rdma_procs;
301 300
302 /* 301 /*
@@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args)
382 new_ep->rep_xprt = xprt; 381 new_ep->rep_xprt = xprt;
383 382
384 xprt_rdma_format_addresses(xprt); 383 xprt_rdma_format_addresses(xprt);
384 xprt->max_payload = rpcrdma_max_payload(new_xprt);
385 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
386 __func__, xprt->max_payload);
385 387
386 if (!try_module_get(THIS_MODULE)) 388 if (!try_module_get(THIS_MODULE))
387 goto out4; 389 goto out4;
@@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
412 if (r_xprt->rx_ep.rep_connected > 0) 414 if (r_xprt->rx_ep.rep_connected > 0)
413 xprt->reestablish_timeout = 0; 415 xprt->reestablish_timeout = 0;
414 xprt_disconnect_done(xprt); 416 xprt_disconnect_done(xprt);
415 (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); 417 rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
416} 418}
417 419
418static void 420static void
@@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task)
595 struct rpc_xprt *xprt = rqst->rq_xprt; 597 struct rpc_xprt *xprt = rqst->rq_xprt;
596 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 598 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
597 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 599 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
598 int rc; 600 int rc = 0;
599 601
600 if (req->rl_niovs == 0) { 602 if (req->rl_niovs == 0)
601 rc = rpcrdma_marshal_req(rqst); 603 rc = rpcrdma_marshal_req(rqst);
602 if (rc < 0) 604 else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
603 goto failed_marshal; 605 rc = rpcrdma_marshal_chunks(rqst, 0);
604 } 606 if (rc < 0)
607 goto failed_marshal;
605 608
606 if (req->rl_reply == NULL) /* e.g. reconnection */ 609 if (req->rl_reply == NULL) /* e.g. reconnection */
607 rpcrdma_recv_buffer_get(req); 610 rpcrdma_recv_buffer_get(req);
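
transport.c defers the max_payload calculation until the memory-registration strategy is known (rpcrdma_max_payload() is presumably added elsewhere in this series) and teaches xprt_rdma_send_request() to distinguish a retransmit: under FRMR, a retransmitted request re-marshals only its chunk list, since the reconnect logic in verbs.c may have replaced the FRMRs and their rkeys. The send-path decision reduces to:

	/* condensed restatement of the new send path */
	if (req->rl_niovs == 0)				/* first transmission */
		rc = rpcrdma_marshal_req(rqst);
	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
		rc = rpcrdma_marshal_chunks(rqst, 0);	/* redo chunks only */
	if (rc < 0)
		goto failed_marshal;
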
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 13dbd1c389ff..61c41298b4ea 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -61,6 +61,8 @@
61# define RPCDBG_FACILITY RPCDBG_TRANS 61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif 62#endif
63 63
64static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
65
64/* 66/*
65 * internal functions 67 * internal functions
66 */ 68 */
@@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data)
103 105
104static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); 106static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105 107
106static inline void
107rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108{
109 unsigned long flags;
110
111 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
112 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
113 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
114 tasklet_schedule(&rpcrdma_tasklet_g);
115}
116
117static void 108static void
118rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) 109rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119{ 110{
@@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
153 if (wc->wr_id == 0ULL) 144 if (wc->wr_id == 0ULL)
154 return; 145 return;
155 if (wc->status != IB_WC_SUCCESS) 146 if (wc->status != IB_WC_SUCCESS)
156 return; 147 frmr->r.frmr.fr_state = FRMR_IS_STALE;
157
158 if (wc->opcode == IB_WC_FAST_REG_MR)
159 frmr->r.frmr.state = FRMR_IS_VALID;
160 else if (wc->opcode == IB_WC_LOCAL_INV)
161 frmr->r.frmr.state = FRMR_IS_INVALID;
162} 148}
163 149
164static int 150static int
@@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
217} 203}
218 204
219static void 205static void
220rpcrdma_recvcq_process_wc(struct ib_wc *wc) 206rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
221{ 207{
222 struct rpcrdma_rep *rep = 208 struct rpcrdma_rep *rep =
223 (struct rpcrdma_rep *)(unsigned long)wc->wr_id; 209 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc)
248 } 234 }
249 235
250out_schedule: 236out_schedule:
251 rpcrdma_schedule_tasklet(rep); 237 list_add_tail(&rep->rr_list, sched_list);
252} 238}
253 239
254static int 240static int
255rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) 241rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
256{ 242{
243 struct list_head sched_list;
257 struct ib_wc *wcs; 244 struct ib_wc *wcs;
258 int budget, count, rc; 245 int budget, count, rc;
246 unsigned long flags;
259 247
248 INIT_LIST_HEAD(&sched_list);
260 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; 249 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
261 do { 250 do {
262 wcs = ep->rep_recv_wcs; 251 wcs = ep->rep_recv_wcs;
263 252
264 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); 253 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
265 if (rc <= 0) 254 if (rc <= 0)
266 return rc; 255 goto out_schedule;
267 256
268 count = rc; 257 count = rc;
269 while (count-- > 0) 258 while (count-- > 0)
270 rpcrdma_recvcq_process_wc(wcs++); 259 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
271 } while (rc == RPCRDMA_POLLSIZE && --budget); 260 } while (rc == RPCRDMA_POLLSIZE && --budget);
272 return 0; 261 rc = 0;
262
263out_schedule:
264 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
265 list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
266 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
267 tasklet_schedule(&rpcrdma_tasklet_g);
268 return rc;
273} 269}
274 270
275/* 271/*
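
The receive-completion rework batches lock traffic: each polled completion is parked on an on-stack sched_list, and the global tasklet lock is taken once per poll loop to splice the whole batch, instead of once per reply. The publish step is the usual local-list pattern:

	/* pattern extracted from the poll loop above: accumulate with no
	 * lock held, then publish the batch with a single locked splice */
	static void publish_batch(struct list_head *sched_list)
	{
		unsigned long flags;

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
		list_splice_tail(sched_list, &rpcrdma_tasklets_g);
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
		tasklet_schedule(&rpcrdma_tasklet_g);
	}
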
@@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
310 rpcrdma_recvcq_poll(cq, ep); 306 rpcrdma_recvcq_poll(cq, ep);
311} 307}
312 308
309static void
310rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
311{
312 rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
313 rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
314}
315
313#ifdef RPC_DEBUG 316#ifdef RPC_DEBUG
314static const char * const conn[] = { 317static const char * const conn[] = {
315 "address resolved", 318 "address resolved",
@@ -323,8 +326,16 @@ static const char * const conn[] = {
323 "rejected", 326 "rejected",
324 "established", 327 "established",
325 "disconnected", 328 "disconnected",
326 "device removal" 329 "device removal",
330 "multicast join",
331 "multicast error",
332 "address change",
333 "timewait exit",
327}; 334};
335
336#define CONNECTION_MSG(status) \
337 ((status) < ARRAY_SIZE(conn) ? \
338 conn[(status)] : "unrecognized connection error")
328#endif 339#endif
329 340
330static int 341static int
@@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
382 case RDMA_CM_EVENT_DEVICE_REMOVAL: 393 case RDMA_CM_EVENT_DEVICE_REMOVAL:
383 connstate = -ENODEV; 394 connstate = -ENODEV;
384connected: 395connected:
385 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
386 __func__,
387 (event->event <= 11) ? conn[event->event] :
388 "unknown connection error",
389 &addr->sin_addr.s_addr,
390 ntohs(addr->sin_port),
391 ep, event->event);
392 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); 396 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
393 dprintk("RPC: %s: %sconnected\n", 397 dprintk("RPC: %s: %sconnected\n",
394 __func__, connstate > 0 ? "" : "dis"); 398 __func__, connstate > 0 ? "" : "dis");
395 ep->rep_connected = connstate; 399 ep->rep_connected = connstate;
396 ep->rep_func(ep); 400 ep->rep_func(ep);
397 wake_up_all(&ep->rep_connect_wait); 401 wake_up_all(&ep->rep_connect_wait);
398 break; 402 /*FALLTHROUGH*/
399 default: 403 default:
400 dprintk("RPC: %s: unexpected CM event %d\n", 404 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
401 __func__, event->event); 405 __func__, &addr->sin_addr.s_addr,
406 ntohs(addr->sin_port), ep,
407 CONNECTION_MSG(event->event));
402 break; 408 break;
403 } 409 }
404 410
@@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
558 if (!ia->ri_id->device->alloc_fmr) { 564 if (!ia->ri_id->device->alloc_fmr) {
559 dprintk("RPC: %s: MTHCAFMR registration " 565 dprintk("RPC: %s: MTHCAFMR registration "
560 "not supported by HCA\n", __func__); 566 "not supported by HCA\n", __func__);
561#if RPCRDMA_PERSISTENT_REGISTRATION
562 memreg = RPCRDMA_ALLPHYSICAL; 567 memreg = RPCRDMA_ALLPHYSICAL;
563#else
564 rc = -ENOMEM;
565 goto out2;
566#endif
567 } 568 }
568 } 569 }
569 570
@@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
578 switch (memreg) { 579 switch (memreg) {
579 case RPCRDMA_FRMR: 580 case RPCRDMA_FRMR:
580 break; 581 break;
581#if RPCRDMA_PERSISTENT_REGISTRATION
582 case RPCRDMA_ALLPHYSICAL: 582 case RPCRDMA_ALLPHYSICAL:
583 mem_priv = IB_ACCESS_LOCAL_WRITE | 583 mem_priv = IB_ACCESS_LOCAL_WRITE |
584 IB_ACCESS_REMOTE_WRITE | 584 IB_ACCESS_REMOTE_WRITE |
585 IB_ACCESS_REMOTE_READ; 585 IB_ACCESS_REMOTE_READ;
586 goto register_setup; 586 goto register_setup;
587#endif
588 case RPCRDMA_MTHCAFMR: 587 case RPCRDMA_MTHCAFMR:
589 if (ia->ri_have_dma_lkey) 588 if (ia->ri_have_dma_lkey)
590 break; 589 break;
591 mem_priv = IB_ACCESS_LOCAL_WRITE; 590 mem_priv = IB_ACCESS_LOCAL_WRITE;
592#if RPCRDMA_PERSISTENT_REGISTRATION
593 register_setup: 591 register_setup:
594#endif
595 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); 592 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
596 if (IS_ERR(ia->ri_bind_mem)) { 593 if (IS_ERR(ia->ri_bind_mem)) {
597 printk(KERN_ALERT "%s: ib_get_dma_mr for " 594 printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
613 /* Else will do memory reg/dereg for each chunk */ 610 /* Else will do memory reg/dereg for each chunk */
614 ia->ri_memreg_strategy = memreg; 611 ia->ri_memreg_strategy = memreg;
615 612
613 rwlock_init(&ia->ri_qplock);
616 return 0; 614 return 0;
617out2: 615out2:
618 rdma_destroy_id(ia->ri_id); 616 rdma_destroy_id(ia->ri_id);
@@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
826 cancel_delayed_work_sync(&ep->rep_connect_worker); 824 cancel_delayed_work_sync(&ep->rep_connect_worker);
827 825
828 if (ia->ri_id->qp) { 826 if (ia->ri_id->qp) {
829 rc = rpcrdma_ep_disconnect(ep, ia); 827 rpcrdma_ep_disconnect(ep, ia);
830 if (rc)
831 dprintk("RPC: %s: rpcrdma_ep_disconnect"
832 " returned %i\n", __func__, rc);
833 rdma_destroy_qp(ia->ri_id); 828 rdma_destroy_qp(ia->ri_id);
834 ia->ri_id->qp = NULL; 829 ia->ri_id->qp = NULL;
835 } 830 }
@@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
859int 854int
860rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 855rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
861{ 856{
862 struct rdma_cm_id *id; 857 struct rdma_cm_id *id, *old;
863 int rc = 0; 858 int rc = 0;
864 int retry_count = 0; 859 int retry_count = 0;
865 860
@@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
867 struct rpcrdma_xprt *xprt; 862 struct rpcrdma_xprt *xprt;
868retry: 863retry:
869 dprintk("RPC: %s: reconnecting...\n", __func__); 864 dprintk("RPC: %s: reconnecting...\n", __func__);
870 rc = rpcrdma_ep_disconnect(ep, ia);
871 if (rc && rc != -ENOTCONN)
872 dprintk("RPC: %s: rpcrdma_ep_disconnect"
873 " status %i\n", __func__, rc);
874 865
875 rpcrdma_clean_cq(ep->rep_attr.recv_cq); 866 rpcrdma_ep_disconnect(ep, ia);
876 rpcrdma_clean_cq(ep->rep_attr.send_cq); 867 rpcrdma_flush_cqs(ep);
868
869 if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
870 rpcrdma_reset_frmrs(ia);
877 871
878 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 872 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
879 id = rpcrdma_create_id(xprt, ia, 873 id = rpcrdma_create_id(xprt, ia,
@@ -905,9 +899,14 @@ retry:
905 rc = -ENETUNREACH; 899 rc = -ENETUNREACH;
906 goto out; 900 goto out;
907 } 901 }
908 rdma_destroy_qp(ia->ri_id); 902
909 rdma_destroy_id(ia->ri_id); 903 write_lock(&ia->ri_qplock);
904 old = ia->ri_id;
910 ia->ri_id = id; 905 ia->ri_id = id;
906 write_unlock(&ia->ri_qplock);
907
908 rdma_destroy_qp(old);
909 rdma_destroy_id(old);
911 } else { 910 } else {
912 dprintk("RPC: %s: connecting...\n", __func__); 911 dprintk("RPC: %s: connecting...\n", __func__);
913 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); 912 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -974,13 +973,12 @@ out:
974 * This call is not reentrant, and must not be made in parallel 973 * This call is not reentrant, and must not be made in parallel
975 * on the same endpoint. 974 * on the same endpoint.
976 */ 975 */
977int 976void
978rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 977rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
979{ 978{
980 int rc; 979 int rc;
981 980
982 rpcrdma_clean_cq(ep->rep_attr.recv_cq); 981 rpcrdma_flush_cqs(ep);
983 rpcrdma_clean_cq(ep->rep_attr.send_cq);
984 rc = rdma_disconnect(ia->ri_id); 982 rc = rdma_disconnect(ia->ri_id);
985 if (!rc) { 983 if (!rc) {
986 /* returns without wait if not connected */ 984 /* returns without wait if not connected */
@@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
992 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); 990 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
993 ep->rep_connected = rc; 991 ep->rep_connected = rc;
994 } 992 }
993}
994
995static int
996rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
997{
998 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
999 struct ib_fmr_attr fmr_attr = {
1000 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1001 .max_maps = 1,
1002 .page_shift = PAGE_SHIFT
1003 };
1004 struct rpcrdma_mw *r;
1005 int i, rc;
1006
1007 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1008 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1009
1010 while (i--) {
1011 r = kzalloc(sizeof(*r), GFP_KERNEL);
1012 if (r == NULL)
1013 return -ENOMEM;
1014
1015 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1016 if (IS_ERR(r->r.fmr)) {
1017 rc = PTR_ERR(r->r.fmr);
1018 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1019 __func__, rc);
1020 goto out_free;
1021 }
1022
1023 list_add(&r->mw_list, &buf->rb_mws);
1024 list_add(&r->mw_all, &buf->rb_all);
1025 }
1026 return 0;
1027
1028out_free:
1029 kfree(r);
1030 return rc;
1031}
1032
1033static int
1034rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1035{
1036 struct rpcrdma_frmr *f;
1037 struct rpcrdma_mw *r;
1038 int i, rc;
1039
1040 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1041 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1042
1043 while (i--) {
1044 r = kzalloc(sizeof(*r), GFP_KERNEL);
1045 if (r == NULL)
1046 return -ENOMEM;
1047 f = &r->r.frmr;
1048
1049 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1050 ia->ri_max_frmr_depth);
1051 if (IS_ERR(f->fr_mr)) {
1052 rc = PTR_ERR(f->fr_mr);
1053 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1054 "failed %i\n", __func__, rc);
1055 goto out_free;
1056 }
1057
1058 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1059 ia->ri_max_frmr_depth);
1060 if (IS_ERR(f->fr_pgl)) {
1061 rc = PTR_ERR(f->fr_pgl);
1062 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1063 "failed %i\n", __func__, rc);
1064
1065 ib_dereg_mr(f->fr_mr);
1066 goto out_free;
1067 }
1068
1069 list_add(&r->mw_list, &buf->rb_mws);
1070 list_add(&r->mw_all, &buf->rb_all);
1071 }
1072
1073 return 0;
1074
1075out_free:
1076 kfree(r);
995 return rc; 1077 return rc;
996} 1078}
997 1079
998/*
999 * Initialize buffer memory
1000 */
1001int 1080int
1002rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, 1081rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1003 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) 1082 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
@@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1005 char *p; 1084 char *p;
1006 size_t len, rlen, wlen; 1085 size_t len, rlen, wlen;
1007 int i, rc; 1086 int i, rc;
1008 struct rpcrdma_mw *r;
1009 1087
1010 buf->rb_max_requests = cdata->max_requests; 1088 buf->rb_max_requests = cdata->max_requests;
1011 spin_lock_init(&buf->rb_lock); 1089 spin_lock_init(&buf->rb_lock);
@@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1016 * 2. arrays of struct rpcrdma_req to fill in pointers 1094 * 2. arrays of struct rpcrdma_req to fill in pointers
1017 * 3. array of struct rpcrdma_rep for replies 1095 * 3. array of struct rpcrdma_rep for replies
1018 * 4. padding, if any 1096 * 4. padding, if any
1019 * 5. mw's, fmr's or frmr's, if any
1020 * Send/recv buffers in req/rep need to be registered 1097 * Send/recv buffers in req/rep need to be registered
1021 */ 1098 */
1022
1023 len = buf->rb_max_requests * 1099 len = buf->rb_max_requests *
1024 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); 1100 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1025 len += cdata->padding; 1101 len += cdata->padding;
1026 switch (ia->ri_memreg_strategy) {
1027 case RPCRDMA_FRMR:
1028 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
1029 sizeof(struct rpcrdma_mw);
1030 break;
1031 case RPCRDMA_MTHCAFMR:
1032 /* TBD we are perhaps overallocating here */
1033 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1034 sizeof(struct rpcrdma_mw);
1035 break;
1036 default:
1037 break;
1038 }
1039 1102
1040 /* allocate 1, 4 and 5 in one shot */
1041 p = kzalloc(len, GFP_KERNEL); 1103 p = kzalloc(len, GFP_KERNEL);
1042 if (p == NULL) { 1104 if (p == NULL) {
1043 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", 1105 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1064 p += cdata->padding; 1126 p += cdata->padding;
1065 1127
1066 INIT_LIST_HEAD(&buf->rb_mws); 1128 INIT_LIST_HEAD(&buf->rb_mws);
1067 r = (struct rpcrdma_mw *)p; 1129 INIT_LIST_HEAD(&buf->rb_all);
1068 switch (ia->ri_memreg_strategy) { 1130 switch (ia->ri_memreg_strategy) {
1069 case RPCRDMA_FRMR: 1131 case RPCRDMA_FRMR:
1070 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { 1132 rc = rpcrdma_init_frmrs(ia, buf);
1071 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, 1133 if (rc)
1072 ia->ri_max_frmr_depth); 1134 goto out;
1073 if (IS_ERR(r->r.frmr.fr_mr)) {
1074 rc = PTR_ERR(r->r.frmr.fr_mr);
1075 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1076 " failed %i\n", __func__, rc);
1077 goto out;
1078 }
1079 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1080 ia->ri_id->device,
1081 ia->ri_max_frmr_depth);
1082 if (IS_ERR(r->r.frmr.fr_pgl)) {
1083 rc = PTR_ERR(r->r.frmr.fr_pgl);
1084 dprintk("RPC: %s: "
1085 "ib_alloc_fast_reg_page_list "
1086 "failed %i\n", __func__, rc);
1087
1088 ib_dereg_mr(r->r.frmr.fr_mr);
1089 goto out;
1090 }
1091 list_add(&r->mw_list, &buf->rb_mws);
1092 ++r;
1093 }
1094 break; 1135 break;
1095 case RPCRDMA_MTHCAFMR: 1136 case RPCRDMA_MTHCAFMR:
1096 /* TBD we are perhaps overallocating here */ 1137 rc = rpcrdma_init_fmrs(ia, buf);
1097 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 1138 if (rc)
1098 static struct ib_fmr_attr fa = 1139 goto out;
1099 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1100 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1101 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1102 &fa);
1103 if (IS_ERR(r->r.fmr)) {
1104 rc = PTR_ERR(r->r.fmr);
1105 dprintk("RPC: %s: ib_alloc_fmr"
1106 " failed %i\n", __func__, rc);
1107 goto out;
1108 }
1109 list_add(&r->mw_list, &buf->rb_mws);
1110 ++r;
1111 }
1112 break; 1140 break;
1113 default: 1141 default:
1114 break; 1142 break;
@@ -1176,24 +1204,57 @@ out:
1176 return rc; 1204 return rc;
1177} 1205}
1178 1206
1179/* 1207static void
1180 * Unregister and destroy buffer memory. Need to deal with 1208rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1181 * partial initialization, so it's callable from failed create. 1209{
1182 * Must be called before destroying endpoint, as registrations 1210 struct rpcrdma_mw *r;
1183 * reference it. 1211 int rc;
1184 */ 1212
1213 while (!list_empty(&buf->rb_all)) {
1214 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1215 list_del(&r->mw_all);
1216 list_del(&r->mw_list);
1217
1218 rc = ib_dealloc_fmr(r->r.fmr);
1219 if (rc)
1220 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1221 __func__, rc);
1222
1223 kfree(r);
1224 }
1225}
1226
1227static void
1228rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1229{
1230 struct rpcrdma_mw *r;
1231 int rc;
1232
1233 while (!list_empty(&buf->rb_all)) {
1234 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1235 list_del(&r->mw_all);
1236 list_del(&r->mw_list);
1237
1238 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1239 if (rc)
1240 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1241 __func__, rc);
1242 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1243
1244 kfree(r);
1245 }
1246}
1247
1185void 1248void
1186rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1249rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1187{ 1250{
1188 int rc, i;
1189 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1251 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1190 struct rpcrdma_mw *r; 1252 int i;
1191 1253
1192 /* clean up in reverse order from create 1254 /* clean up in reverse order from create
1193 * 1. recv mr memory (mr free, then kfree) 1255 * 1. recv mr memory (mr free, then kfree)
1194 * 2. send mr memory (mr free, then kfree) 1256 * 2. send mr memory (mr free, then kfree)
1195 * 3. padding (if any) [moved to rpcrdma_ep_destroy] 1257 * 3. MWs
1196 * 4. arrays
1197 */ 1258 */
1198 dprintk("RPC: %s: entering\n", __func__); 1259 dprintk("RPC: %s: entering\n", __func__);
1199 1260
@@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1212 } 1273 }
1213 } 1274 }
1214 1275
1276 switch (ia->ri_memreg_strategy) {
1277 case RPCRDMA_FRMR:
1278 rpcrdma_destroy_frmrs(buf);
1279 break;
1280 case RPCRDMA_MTHCAFMR:
1281 rpcrdma_destroy_fmrs(buf);
1282 break;
1283 default:
1284 break;
1285 }
1286
1287 kfree(buf->rb_pool);
1288}
1289
1290/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1291 * an unusable state. Find FRMRs in this state and dereg / reg
1292 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1293 * also torn down.
1294 *
1295 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1296 *
1297 * This is invoked only in the transport connect worker in order
1298 * to serialize with rpcrdma_register_frmr_external().
1299 */
1300static void
1301rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1302{
1303 struct rpcrdma_xprt *r_xprt =
1304 container_of(ia, struct rpcrdma_xprt, rx_ia);
1305 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1306 struct list_head *pos;
1307 struct rpcrdma_mw *r;
1308 int rc;
1309
1310 list_for_each(pos, &buf->rb_all) {
1311 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1312
1313 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1314 continue;
1315
1316 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1317 if (rc)
1318 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1319 __func__, rc);
1320 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1321
1322 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1323 ia->ri_max_frmr_depth);
1324 if (IS_ERR(r->r.frmr.fr_mr)) {
1325 rc = PTR_ERR(r->r.frmr.fr_mr);
1326 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1327 " failed %i\n", __func__, rc);
1328 continue;
1329 }
1330 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1331 ia->ri_id->device,
1332 ia->ri_max_frmr_depth);
1333 if (IS_ERR(r->r.frmr.fr_pgl)) {
1334 rc = PTR_ERR(r->r.frmr.fr_pgl);
1335 dprintk("RPC: %s: "
1336 "ib_alloc_fast_reg_page_list "
1337 "failed %i\n", __func__, rc);
1338
1339 ib_dereg_mr(r->r.frmr.fr_mr);
1340 continue;
1341 }
1342 r->r.frmr.fr_state = FRMR_IS_INVALID;
1343 }
1344}
1345
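
rpcrdma_reset_frmrs() exists because a connection loss flushes posted FAST_REG_MR work requests, leaving their FRMRs in a state the HCA will reject; the send-completion handler above now marks such FRMRs stale, and the connect worker deregisters and reallocates each one so every MR comes back invalid with a fresh rkey. The states involved (fr_state is introduced by this series; this enum is a paraphrase of it):

	enum rpcrdma_frmr_state {
		FRMR_IS_INVALID,	/* ready for FAST_REG_MR */
		FRMR_IS_VALID,		/* registered; owned by a request */
		FRMR_IS_STALE,		/* flushed by a disconnect; needs reset */
	};
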
1346/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1347 * some req segments uninitialized.
1348 */
1349static void
1350rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1351{
1352 if (*mw) {
1353 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1354 *mw = NULL;
1355 }
1356}
1357
1358/* Cycle mw's back in reverse order, and "spin" them.
1359 * This delays and scrambles reuse as much as possible.
1360 */
1361static void
1362rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1363{
1364 struct rpcrdma_mr_seg *seg = req->rl_segments;
1365 struct rpcrdma_mr_seg *seg1 = seg;
1366 int i;
1367
1368 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1369 rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
1370 rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
1371}
1372
1373static void
1374rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1375{
1376 buf->rb_send_bufs[--buf->rb_send_index] = req;
1377 req->rl_niovs = 0;
1378 if (req->rl_reply) {
1379 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1380 req->rl_reply->rr_func = NULL;
1381 req->rl_reply = NULL;
1382 }
1383}
1384
1385/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1386 * Redo only the ib_post_send().
1387 */
1388static void
1389rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1390{
1391 struct rpcrdma_xprt *r_xprt =
1392 container_of(ia, struct rpcrdma_xprt, rx_ia);
1393 struct ib_send_wr invalidate_wr, *bad_wr;
1394 int rc;
1395
1396 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1397
1398 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1399 r->r.frmr.fr_state = FRMR_IS_INVALID;
1400
1401 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1402 invalidate_wr.wr_id = (unsigned long)(void *)r;
1403 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1404 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1405 DECR_CQCOUNT(&r_xprt->rx_ep);
1406
1407 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1408 __func__, r, r->r.frmr.fr_mr->rkey);
1409
1410 read_lock(&ia->ri_qplock);
1411 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1412 read_unlock(&ia->ri_qplock);
1413 if (rc) {
1414 /* Force rpcrdma_buffer_get() to retry */
1415 r->r.frmr.fr_state = FRMR_IS_STALE;
1416 dprintk("RPC: %s: ib_post_send failed, %i\n",
1417 __func__, rc);
1418 }
1419}
1420
1421static void
1422rpcrdma_retry_flushed_linv(struct list_head *stale,
1423 struct rpcrdma_buffer *buf)
1424{
1425 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1426 struct list_head *pos;
1427 struct rpcrdma_mw *r;
1428 unsigned long flags;
1429
1430 list_for_each(pos, stale) {
1431 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1432 rpcrdma_retry_local_inv(r, ia);
1433 }
1434
1435 spin_lock_irqsave(&buf->rb_lock, flags);
1436 list_splice_tail(stale, &buf->rb_mws);
1437 spin_unlock_irqrestore(&buf->rb_lock, flags);
1438}
1439
1440static struct rpcrdma_req *
1441rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1442 struct list_head *stale)
1443{
1444 struct rpcrdma_mw *r;
1445 int i;
1446
1447 i = RPCRDMA_MAX_SEGS - 1;
1215 while (!list_empty(&buf->rb_mws)) { 1448 while (!list_empty(&buf->rb_mws)) {
1216 r = list_entry(buf->rb_mws.next, 1449 r = list_entry(buf->rb_mws.next,
1217 struct rpcrdma_mw, mw_list); 1450 struct rpcrdma_mw, mw_list);
1218 list_del(&r->mw_list); 1451 list_del(&r->mw_list);
1219 switch (ia->ri_memreg_strategy) { 1452 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1220 case RPCRDMA_FRMR: 1453 list_add(&r->mw_list, stale);
1221 rc = ib_dereg_mr(r->r.frmr.fr_mr); 1454 continue;
1222 if (rc)
1223 dprintk("RPC: %s:"
1224 " ib_dereg_mr"
1225 " failed %i\n",
1226 __func__, rc);
1227 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1228 break;
1229 case RPCRDMA_MTHCAFMR:
1230 rc = ib_dealloc_fmr(r->r.fmr);
1231 if (rc)
1232 dprintk("RPC: %s:"
1233 " ib_dealloc_fmr"
1234 " failed %i\n",
1235 __func__, rc);
1236 break;
1237 default:
1238 break;
1239 } 1455 }
1456 req->rl_segments[i].mr_chunk.rl_mw = r;
1457 if (unlikely(i-- == 0))
1458 return req; /* Success */
1240 } 1459 }
1241 1460
1242 kfree(buf->rb_pool); 1461 /* Not enough entries on rb_mws for this req */
1462 rpcrdma_buffer_put_sendbuf(req, buf);
1463 rpcrdma_buffer_put_mrs(req, buf);
1464 return NULL;
1465}
1466
1467static struct rpcrdma_req *
1468rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1469{
1470 struct rpcrdma_mw *r;
1471 int i;
1472
1473 i = RPCRDMA_MAX_SEGS - 1;
1474 while (!list_empty(&buf->rb_mws)) {
1475 r = list_entry(buf->rb_mws.next,
1476 struct rpcrdma_mw, mw_list);
1477 list_del(&r->mw_list);
1478 req->rl_segments[i].mr_chunk.rl_mw = r;
1479 if (unlikely(i-- == 0))
1480 return req; /* Success */
1481 }
1482
1483 /* Not enough entries on rb_mws for this req */
1484 rpcrdma_buffer_put_sendbuf(req, buf);
1485 rpcrdma_buffer_put_mrs(req, buf);
1486 return NULL;
1243} 1487}
1244 1488
1245/* 1489/*
@@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1254struct rpcrdma_req * 1498struct rpcrdma_req *
1255rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1499rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1256{ 1500{
1501 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1502 struct list_head stale;
1257 struct rpcrdma_req *req; 1503 struct rpcrdma_req *req;
1258 unsigned long flags; 1504 unsigned long flags;
1259 int i;
1260 struct rpcrdma_mw *r;
1261 1505
1262 spin_lock_irqsave(&buffers->rb_lock, flags); 1506 spin_lock_irqsave(&buffers->rb_lock, flags);
1263 if (buffers->rb_send_index == buffers->rb_max_requests) { 1507 if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1277 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; 1521 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1278 } 1522 }
1279 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; 1523 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1280 if (!list_empty(&buffers->rb_mws)) { 1524
1281 i = RPCRDMA_MAX_SEGS - 1; 1525 INIT_LIST_HEAD(&stale);
1282 do { 1526 switch (ia->ri_memreg_strategy) {
1283 r = list_entry(buffers->rb_mws.next, 1527 case RPCRDMA_FRMR:
1284 struct rpcrdma_mw, mw_list); 1528 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1285 list_del(&r->mw_list); 1529 break;
1286 req->rl_segments[i].mr_chunk.rl_mw = r; 1530 case RPCRDMA_MTHCAFMR:
1287 } while (--i >= 0); 1531 req = rpcrdma_buffer_get_fmrs(req, buffers);
1532 break;
1533 default:
1534 break;
1288 } 1535 }
1289 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1536 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1537 if (!list_empty(&stale))
1538 rpcrdma_retry_flushed_linv(&stale, buffers);
1290 return req; 1539 return req;
1291} 1540}
1292 1541
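Taken together, the helpers above give rpcrdma_buffer_get() a simple contract in FRMR mode: claim memory windows while holding rb_lock, and park any that failed a flushed completion so they can be re-invalidated once the lock is dropped. A condensed sketch of that flow (names as in this patch; bounds checks and the receive-buffer handoff elided):

    /* Condensed sketch of the FRMR path in rpcrdma_buffer_get():
     * claim MWs under rb_lock, but post the LOCAL_INV retries for
     * stale MWs only after the spinlock has been released.
     */
    struct list_head stale;
    unsigned long flags;

    INIT_LIST_HEAD(&stale);
    spin_lock_irqsave(&buffers->rb_lock, flags);
    req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
    spin_unlock_irqrestore(&buffers->rb_lock, flags);
    if (!list_empty(&stale))
        rpcrdma_retry_flushed_linv(&stale, buffers);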
@@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
1299{ 1548{
1300 struct rpcrdma_buffer *buffers = req->rl_buffer; 1549 struct rpcrdma_buffer *buffers = req->rl_buffer;
1301 struct rpcrdma_ia *ia = rdmab_to_ia(buffers); 1550 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1302 int i;
1303 unsigned long flags; 1551 unsigned long flags;
1304 1552
1305 spin_lock_irqsave(&buffers->rb_lock, flags); 1553 spin_lock_irqsave(&buffers->rb_lock, flags);
1306 buffers->rb_send_bufs[--buffers->rb_send_index] = req; 1554 rpcrdma_buffer_put_sendbuf(req, buffers);
1307 req->rl_niovs = 0;
1308 if (req->rl_reply) {
1309 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1310 req->rl_reply->rr_func = NULL;
1311 req->rl_reply = NULL;
1312 }
1313 switch (ia->ri_memreg_strategy) { 1555 switch (ia->ri_memreg_strategy) {
1314 case RPCRDMA_FRMR: 1556 case RPCRDMA_FRMR:
1315 case RPCRDMA_MTHCAFMR: 1557 case RPCRDMA_MTHCAFMR:
1316 /* 1558 rpcrdma_buffer_put_mrs(req, buffers);
1317 * Cycle mw's back in reverse order, and "spin" them.
1318 * This delays and scrambles reuse as much as possible.
1319 */
1320 i = 1;
1321 do {
1322 struct rpcrdma_mw **mw;
1323 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325 *mw = NULL;
1326 } while (++i < RPCRDMA_MAX_SEGS);
1327 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328 &buffers->rb_mws);
1329 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330 break; 1559 break;
1331 default: 1560 default:
1332 break; 1561 break;
@@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1388 */ 1617 */
1389 iov->addr = ib_dma_map_single(ia->ri_id->device, 1618 iov->addr = ib_dma_map_single(ia->ri_id->device,
1390 va, len, DMA_BIDIRECTIONAL); 1619 va, len, DMA_BIDIRECTIONAL);
1620 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1621 return -ENOMEM;
1622
1391 iov->length = len; 1623 iov->length = len;
1392 1624
1393 if (ia->ri_have_dma_lkey) { 1625 if (ia->ri_have_dma_lkey) {
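The new ib_dma_mapping_error() check above closes a gap where a failed DMA mapping could be handed to the adapter. As a hedged illustration of the discipline (the helper name and context are assumed for illustration, not part of this patch):

    /* Assumed illustrative helper: validate every bus address before
     * it is used, and pair each ib_dma_map_single() with an
     * ib_dma_unmap_single() on teardown.
     */
    static int rpcrdma_map_buf(struct ib_device *dev, void *va,
                               size_t len, u64 *addr)
    {
        *addr = ib_dma_map_single(dev, va, len, DMA_BIDIRECTIONAL);
        if (ib_dma_mapping_error(dev, *addr))
            return -ENOMEM;    /* never post an unchecked address */
        return 0;
    }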
@@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483 struct rpcrdma_xprt *r_xprt) 1715 struct rpcrdma_xprt *r_xprt)
1484{ 1716{
1485 struct rpcrdma_mr_seg *seg1 = seg; 1717 struct rpcrdma_mr_seg *seg1 = seg;
1486 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; 1718 struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
1487 1719 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1720 struct ib_mr *mr = frmr->fr_mr;
1721 struct ib_send_wr fastreg_wr, *bad_wr;
1488 u8 key; 1722 u8 key;
1489 int len, pageoff; 1723 int len, pageoff;
1490 int i, rc; 1724 int i, rc;
@@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1502 rpcrdma_map_one(ia, seg, writing); 1736 rpcrdma_map_one(ia, seg, writing);
1503 pa = seg->mr_dma; 1737 pa = seg->mr_dma;
1504 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { 1738 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1505 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl-> 1739 frmr->fr_pgl->page_list[page_no++] = pa;
1506 page_list[page_no++] = pa;
1507 pa += PAGE_SIZE; 1740 pa += PAGE_SIZE;
1508 } 1741 }
1509 len += seg->mr_len; 1742 len += seg->mr_len;
@@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1515 break; 1748 break;
1516 } 1749 }
1517 dprintk("RPC: %s: Using frmr %p to map %d segments\n", 1750 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1518 __func__, seg1->mr_chunk.rl_mw, i); 1751 __func__, mw, i);
1519 1752
1520 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { 1753 frmr->fr_state = FRMR_IS_VALID;
1521 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", 1754
1522 __func__, 1755 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1523 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); 1756 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1524 /* Invalidate before using. */ 1757 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1525 memset(&invalidate_wr, 0, sizeof invalidate_wr); 1758 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1526 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; 1759 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1527 invalidate_wr.next = &frmr_wr; 1760 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1528 invalidate_wr.opcode = IB_WR_LOCAL_INV; 1761 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1529 invalidate_wr.send_flags = IB_SEND_SIGNALED; 1762 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1530 invalidate_wr.ex.invalidate_rkey = 1763 if (fastreg_wr.wr.fast_reg.length < len) {
1531 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1764 rc = -EIO;
1532 DECR_CQCOUNT(&r_xprt->rx_ep); 1765 goto out_err;
1533 post_wr = &invalidate_wr;
1534 } else
1535 post_wr = &frmr_wr;
1536
1537 /* Prepare FRMR WR */
1538 memset(&frmr_wr, 0, sizeof frmr_wr);
1539 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1540 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1541 frmr_wr.send_flags = IB_SEND_SIGNALED;
1542 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1543 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1544 frmr_wr.wr.fast_reg.page_list_len = page_no;
1545 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1546 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1547 if (frmr_wr.wr.fast_reg.length < len) {
1548 while (seg1->mr_nsegs--)
1549 rpcrdma_unmap_one(ia, seg++);
1550 return -EIO;
1551 } 1766 }
1552 1767
1553 /* Bump the key */ 1768 /* Bump the key */
1554 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); 1769 key = (u8)(mr->rkey & 0x000000FF);
1555 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); 1770 ib_update_fast_reg_key(mr, ++key);
1556 1771
1557 frmr_wr.wr.fast_reg.access_flags = (writing ? 1772 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1558 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 1773 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1559 IB_ACCESS_REMOTE_READ); 1774 IB_ACCESS_REMOTE_READ);
1560 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1775 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1561 DECR_CQCOUNT(&r_xprt->rx_ep); 1776 DECR_CQCOUNT(&r_xprt->rx_ep);
1562 1777
1563 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); 1778 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1564
1565 if (rc) { 1779 if (rc) {
1566 dprintk("RPC: %s: failed ib_post_send for register," 1780 dprintk("RPC: %s: failed ib_post_send for register,"
1567 " status %i\n", __func__, rc); 1781 " status %i\n", __func__, rc);
1568 while (i--) 1782 ib_update_fast_reg_key(mr, --key);
1569 rpcrdma_unmap_one(ia, --seg); 1783 goto out_err;
1570 } else { 1784 } else {
1571 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1785 seg1->mr_rkey = mr->rkey;
1572 seg1->mr_base = seg1->mr_dma + pageoff; 1786 seg1->mr_base = seg1->mr_dma + pageoff;
1573 seg1->mr_nsegs = i; 1787 seg1->mr_nsegs = i;
1574 seg1->mr_len = len; 1788 seg1->mr_len = len;
1575 } 1789 }
1576 *nsegs = i; 1790 *nsegs = i;
1791 return 0;
1792out_err:
1793 frmr->fr_state = FRMR_IS_INVALID;
1794 while (i--)
1795 rpcrdma_unmap_one(ia, --seg);
1577 return rc; 1796 return rc;
1578} 1797}
1579 1798
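Two details in the rewritten registration path above are worth noting: the FRMR is marked FRMR_IS_VALID before the FAST_REG_MR WR is posted, and the low-order byte of the rkey (the consumer-owned "key" field) is bumped on every registration so a stale remote access using the previous rkey is rejected by the HCA. A sketch of the bump-and-rollback pairing:

    /* Sketch: cycle the consumer-owned low byte of the rkey before
     * posting, and roll it back if the post fails so the MR's key
     * stays in sync with what was actually registered.
     */
    u8 key = (u8)(mr->rkey & 0x000000FF);
    ib_update_fast_reg_key(mr, ++key);
    fastreg_wr.wr.fast_reg.rkey = mr->rkey;

    rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
    if (rc)
        ib_update_fast_reg_key(mr, --key);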
@@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1585 struct ib_send_wr invalidate_wr, *bad_wr; 1804 struct ib_send_wr invalidate_wr, *bad_wr;
1586 int rc; 1805 int rc;
1587 1806
1588 while (seg1->mr_nsegs--) 1807 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1589 rpcrdma_unmap_one(ia, seg++);
1590 1808
1591 memset(&invalidate_wr, 0, sizeof invalidate_wr); 1809 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1592 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; 1810 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1593 invalidate_wr.opcode = IB_WR_LOCAL_INV; 1811 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1594 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1595 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1812 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1596 DECR_CQCOUNT(&r_xprt->rx_ep); 1813 DECR_CQCOUNT(&r_xprt->rx_ep);
1597 1814
1815 read_lock(&ia->ri_qplock);
1816 while (seg1->mr_nsegs--)
1817 rpcrdma_unmap_one(ia, seg++);
1598 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); 1818 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1599 if (rc) 1819 read_unlock(&ia->ri_qplock);
1820 if (rc) {
1821 /* Force rpcrdma_buffer_get() to retry */
1822 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1600 dprintk("RPC: %s: failed ib_post_send for invalidate," 1823 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1601 " status %i\n", __func__, rc); 1824 " status %i\n", __func__, rc);
1825 }
1602 return rc; 1826 return rc;
1603} 1827}
1604 1828
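The read_lock(&ia->ri_qplock) above pins ia->ri_id->qp across the unmap-and-post sequence. The write side is not visible in this hunk; presumably the connect path replaces the QP under the write lock, along the lines of:

    /* Assumed writer side (not shown in this hunk): reconnect swaps
     * in a fresh cm_id/QP only while no sender holds the read lock,
     * so ib_post_send() can never dereference a freed QP.
     */
    write_lock(&ia->ri_qplock);
    old_id = ia->ri_id;
    ia->ri_id = new_id;
    write_unlock(&ia->ri_qplock);
    rdma_destroy_qp(old_id);
    rdma_destroy_id(old_id);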
@@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1656 1880
1657 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); 1881 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1658 rc = ib_unmap_fmr(&l); 1882 rc = ib_unmap_fmr(&l);
1883 read_lock(&ia->ri_qplock);
1659 while (seg1->mr_nsegs--) 1884 while (seg1->mr_nsegs--)
1660 rpcrdma_unmap_one(ia, seg++); 1885 rpcrdma_unmap_one(ia, seg++);
1886 read_unlock(&ia->ri_qplock);
1661 if (rc) 1887 if (rc)
1662 dprintk("RPC: %s: failed ib_unmap_fmr," 1888 dprintk("RPC: %s: failed ib_unmap_fmr,"
1663 " status %i\n", __func__, rc); 1889 " status %i\n", __func__, rc);
@@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1673 1899
1674 switch (ia->ri_memreg_strategy) { 1900 switch (ia->ri_memreg_strategy) {
1675 1901
1676#if RPCRDMA_PERSISTENT_REGISTRATION
1677 case RPCRDMA_ALLPHYSICAL: 1902 case RPCRDMA_ALLPHYSICAL:
1678 rpcrdma_map_one(ia, seg, writing); 1903 rpcrdma_map_one(ia, seg, writing);
1679 seg->mr_rkey = ia->ri_bind_mem->rkey; 1904 seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1681 seg->mr_nsegs = 1; 1906 seg->mr_nsegs = 1;
1682 nsegs = 1; 1907 nsegs = 1;
1683 break; 1908 break;
1684#endif
1685 1909
1686 /* Registration using frmr registration */ 1910 /* Registration using frmr registration */
1687 case RPCRDMA_FRMR: 1911 case RPCRDMA_FRMR:
@@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1711 1935
1712 switch (ia->ri_memreg_strategy) { 1936 switch (ia->ri_memreg_strategy) {
1713 1937
1714#if RPCRDMA_PERSISTENT_REGISTRATION
1715 case RPCRDMA_ALLPHYSICAL: 1938 case RPCRDMA_ALLPHYSICAL:
1939 read_lock(&ia->ri_qplock);
1716 rpcrdma_unmap_one(ia, seg); 1940 rpcrdma_unmap_one(ia, seg);
1941 read_unlock(&ia->ri_qplock);
1717 break; 1942 break;
1718#endif
1719 1943
1720 case RPCRDMA_FRMR: 1944 case RPCRDMA_FRMR:
1721 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); 1945 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
@@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1809 rc); 2033 rc);
1810 return rc; 2034 return rc;
1811} 2035}
2036
2037/* Physical mapping means one Read/Write list entry per page.
2038 * All list entries must fit within an inline buffer
2039 *
2040 * NB: The server must return a Write list for NFS READ,
2041 * which has the same constraint. Factor in the inline
2042 * rsize as well.
2043 */
2044static size_t
2045rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2046{
2047 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2048 unsigned int inline_size, pages;
2049
2050 inline_size = min_t(unsigned int,
2051 cdata->inline_wsize, cdata->inline_rsize);
2052 inline_size -= RPCRDMA_HDRLEN_MIN;
2053 pages = inline_size / sizeof(struct rpcrdma_segment);
2054 return pages << PAGE_SHIFT;
2055}
2056
2057static size_t
2058rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2059{
2060 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2061}
2062
2063size_t
2064rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2065{
2066 size_t result;
2067
2068 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2069 case RPCRDMA_ALLPHYSICAL:
2070 result = rpcrdma_physical_max_payload(r_xprt);
2071 break;
2072 default:
2073 result = rpcrdma_mr_max_payload(r_xprt);
2074 }
2075 return result;
2076}
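A worked example of the rpcrdma_physical_max_payload() arithmetic above, with all constants assumed for illustration (a 1024-byte default inline size, a 28-byte RPCRDMA_HDRLEN_MIN, a 16-byte struct rpcrdma_segment, and 4 KiB pages; the values in a given tree may differ):

    unsigned int inline_size = 1024 - 28;   /* 996 bytes left for segments */
    unsigned int pages = inline_size / 16;  /* 62 segments fit inline      */
    size_t max = (size_t)pages << 12;       /* 62 * 4096 = 253952 bytes    */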
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 89e7cd479705..c419498b8f46 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -59,6 +59,7 @@
59 * Interface Adapter -- one per transport instance 59 * Interface Adapter -- one per transport instance
60 */ 60 */
61struct rpcrdma_ia { 61struct rpcrdma_ia {
62 rwlock_t ri_qplock;
62 struct rdma_cm_id *ri_id; 63 struct rdma_cm_id *ri_id;
63 struct ib_pd *ri_pd; 64 struct ib_pd *ri_pd;
64 struct ib_mr *ri_bind_mem; 65 struct ib_mr *ri_bind_mem;
@@ -98,6 +99,14 @@ struct rpcrdma_ep {
98#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 99#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
99#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 100#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
100 101
102enum rpcrdma_chunktype {
103 rpcrdma_noch = 0,
104 rpcrdma_readch,
105 rpcrdma_areadch,
106 rpcrdma_writech,
107 rpcrdma_replych
108};
109
101/* 110/*
102 * struct rpcrdma_rep -- this structure encapsulates state required to recv 111 * struct rpcrdma_rep -- this structure encapsulates state required to recv
103 * and complete a reply, asynchronously. It needs several pieces of 112
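The rpcrdma_chunktype values above name the RPC-over-RDMA transfer models a marshaled request can select. A hedged summary in RFC 5666 terminology (the mapping is inferred from this patch):

    /* rpcrdma_noch    - small call and reply, sent entirely inline
     * rpcrdma_readch  - bulk argument data moved via a Read chunk list
     * rpcrdma_areadch - the whole call message moved as a Read chunk
     * rpcrdma_writech - bulk result data returned via a Write list
     * rpcrdma_replych - the whole reply returned via a Reply chunk
     */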
@@ -137,6 +146,40 @@ struct rpcrdma_rep {
137}; 146};
138 147
139/* 148/*
149 * struct rpcrdma_mw - external memory region metadata
150 *
151 * An external memory region is any buffer or page that is registered
152 * on the fly (i.e., not pre-registered).
153 *
154 * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
155 * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
156 * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
157 * track of registration metadata while each RPC is pending.
158 * rpcrdma_deregister_external() uses this metadata to unmap and
159 * release these resources when an RPC is complete.
160 */
161enum rpcrdma_frmr_state {
162 FRMR_IS_INVALID, /* ready to be used */
163 FRMR_IS_VALID, /* in use */
164 FRMR_IS_STALE, /* failed completion */
165};
166
167struct rpcrdma_frmr {
168 struct ib_fast_reg_page_list *fr_pgl;
169 struct ib_mr *fr_mr;
170 enum rpcrdma_frmr_state fr_state;
171};
172
173struct rpcrdma_mw {
174 union {
175 struct ib_fmr *fmr;
176 struct rpcrdma_frmr frmr;
177 } r;
178 struct list_head mw_list;
179 struct list_head mw_all;
180};
181
182/*
140 * struct rpcrdma_req -- structure central to the request/reply sequence. 183 * struct rpcrdma_req -- structure central to the request/reply sequence.
141 * 184 *
142 * N of these are associated with a transport instance, and stored in 185 * N of these are associated with a transport instance, and stored in
@@ -163,17 +206,7 @@ struct rpcrdma_rep {
163struct rpcrdma_mr_seg { /* chunk descriptors */ 206struct rpcrdma_mr_seg { /* chunk descriptors */
164 union { /* chunk memory handles */ 207 union { /* chunk memory handles */
165 struct ib_mr *rl_mr; /* if registered directly */ 208 struct ib_mr *rl_mr; /* if registered directly */
166 struct rpcrdma_mw { /* if registered from region */ 209 struct rpcrdma_mw *rl_mw; /* if registered from region */
167 union {
168 struct ib_fmr *fmr;
169 struct {
170 struct ib_fast_reg_page_list *fr_pgl;
171 struct ib_mr *fr_mr;
172 enum { FRMR_IS_INVALID, FRMR_IS_VALID } state;
173 } frmr;
174 } r;
175 struct list_head mw_list;
176 } *rl_mw;
177 } mr_chunk; 210 } mr_chunk;
178 u64 mr_base; /* registration result */ 211 u64 mr_base; /* registration result */
179 u32 mr_rkey; /* registration result */ 212 u32 mr_rkey; /* registration result */
@@ -191,6 +224,7 @@ struct rpcrdma_req {
191 unsigned int rl_niovs; /* 0, 2 or 4 */ 224 unsigned int rl_niovs; /* 0, 2 or 4 */
192 unsigned int rl_nchunks; /* non-zero if chunks */ 225 unsigned int rl_nchunks; /* non-zero if chunks */
193 unsigned int rl_connect_cookie; /* retry detection */ 226 unsigned int rl_connect_cookie; /* retry detection */
227 enum rpcrdma_chunktype rl_rtype, rl_wtype;
194 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 228 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
195 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 229 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
196 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ 230 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -214,6 +248,7 @@ struct rpcrdma_buffer {
214 atomic_t rb_credits; /* most recent server credits */ 248 atomic_t rb_credits; /* most recent server credits */
215 int rb_max_requests;/* client max requests */ 249 int rb_max_requests;/* client max requests */
216 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ 250 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
251 struct list_head rb_all;
217 int rb_send_index; 252 int rb_send_index;
218 struct rpcrdma_req **rb_send_bufs; 253 struct rpcrdma_req **rb_send_bufs;
219 int rb_recv_index; 254 int rb_recv_index;
@@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
306 struct rpcrdma_create_data_internal *); 341 struct rpcrdma_create_data_internal *);
307void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); 342void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
308int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); 343int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
309int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); 344void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
310 345
311int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, 346int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
312 struct rpcrdma_req *); 347 struct rpcrdma_req *);
@@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
346/* 381/*
347 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c 382 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
348 */ 383 */
384ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
349int rpcrdma_marshal_req(struct rpc_rqst *); 385int rpcrdma_marshal_req(struct rpc_rqst *);
386size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
350 387
351/* Temporary NFS request map cache. Created in svc_rdma.c */ 388/* Temporary NFS request map cache. Created in svc_rdma.c */
352extern struct kmem_cache *svc_rdma_map_cachep; 389extern struct kmem_cache *svc_rdma_map_cachep;
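The tri-state fr_state field introduced above is what ties the verbs.c changes in this patch together. The transitions, as far as this diff shows them (the stale-to-invalid step happens in the retry path and is inferred):

    /* FRMR life cycle implied by this patch:
     *
     *   FRMR_IS_INVALID -> FRMR_IS_VALID
     *       rpcrdma_register_frmr_external() posts FAST_REG_MR
     *   FRMR_IS_VALID -> FRMR_IS_INVALID
     *       rpcrdma_deregister_frmr_external() posts LOCAL_INV
     *   any state -> FRMR_IS_STALE
     *       a flushed or failed completion; forces buffer_get to retry
     *   FRMR_IS_STALE -> FRMR_IS_INVALID (inferred)
     *       rpcrdma_retry_flushed_linv() re-posts the invalidate
     */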
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index be8bbd5d65ec..43cd89eacfab 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task)
594 } 594 }
595 595
596 switch (status) { 596 switch (status) {
597 case -ENOBUFS:
597 case -EAGAIN: 598 case -EAGAIN:
598 status = xs_nospace(task); 599 status = xs_nospace(task);
599 break; 600 break;
@@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task)
661 dprintk("RPC: sendmsg returned unrecognized error %d\n", 662 dprintk("RPC: sendmsg returned unrecognized error %d\n",
662 -status); 663 -status);
663 case -ENETUNREACH: 664 case -ENETUNREACH:
665 case -ENOBUFS:
664 case -EPIPE: 666 case -EPIPE:
665 case -ECONNREFUSED: 667 case -ECONNREFUSED:
666 /* When the server has died, an ICMP port unreachable message 668 /* When the server has died, an ICMP port unreachable message
@@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
758 status = -ENOTCONN; 760 status = -ENOTCONN;
759 /* Should we call xs_close() here? */ 761 /* Should we call xs_close() here? */
760 break; 762 break;
763 case -ENOBUFS:
761 case -EAGAIN: 764 case -EAGAIN:
762 status = xs_nospace(task); 765 status = xs_nospace(task);
763 break; 766 break;
@@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
1946 dprintk("RPC: xprt %p connected to %s\n", 1949 dprintk("RPC: xprt %p connected to %s\n",
1947 xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); 1950 xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
1948 xprt_set_connected(xprt); 1951 xprt_set_connected(xprt);
1952 case -ENOBUFS:
1949 break; 1953 break;
1950 case -ENOENT: 1954 case -ENOENT:
1951 dprintk("RPC: xprt %p: socket %s does not exist\n", 1955 dprintk("RPC: xprt %p: socket %s does not exist\n",
@@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2281 case -ECONNREFUSED: 2285 case -ECONNREFUSED:
2282 case -ECONNRESET: 2286 case -ECONNRESET:
2283 case -ENETUNREACH: 2287 case -ENETUNREACH:
2288 case -ENOBUFS:
2284 /* retry with existing socket, after a delay */ 2289 /* retry with existing socket, after a delay */
2285 goto out; 2290 goto out;
2286 } 2291 }
@@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val,
3054 const struct kernel_param *kp, 3059 const struct kernel_param *kp,
3055 unsigned int min, unsigned int max) 3060 unsigned int min, unsigned int max)
3056{ 3061{
3057 unsigned long num; 3062 unsigned int num;
3058 int ret; 3063 int ret;
3059 3064
3060 if (!val) 3065 if (!val)
3061 return -EINVAL; 3066 return -EINVAL;
3062 ret = strict_strtoul(val, 0, &num); 3067 ret = kstrtouint(val, 0, &num);
3063 if (ret == -EINVAL || num < min || num > max) 3068 if (ret == -EINVAL || num < min || num > max)
3064 return -EINVAL; 3069 return -EINVAL;
3065 *((unsigned int *)kp->arg) = num; 3070 *((unsigned int *)kp->arg) = num;
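param_set_uint_minmax() is the generic setter behind sunrpc's port module parameters; with kstrtouint() it parses directly into the unsigned int it stores, so the intermediate unsigned long is gone. A usage sketch along the lines of the existing xprtsock.c port parameters:

    static int param_set_portnr(const char *val, const struct kernel_param *kp)
    {
        /* Reserved-port bounds come from the RPC_MIN/MAX_RESVPORT limits */
        return param_set_uint_minmax(val, kp,
                                     RPC_MIN_RESVPORT,
                                     RPC_MAX_RESVPORT);
    }

    static struct kernel_param_ops param_ops_portnr = {
        .set = param_set_portnr,
        .get = param_get_uint,
    };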