author		Andy Adamson <andros@netapp.com>	2010-10-20 00:18:03 -0400
committer	Trond Myklebust <Trond.Myklebust@netapp.com>	2010-10-24 18:07:10 -0400
commit		b1f69b754ee312ec75f2c7ead0e6851cd9598cc2
tree		1d8e70abb2cd087e3b97f73d86db8b9568467378 /fs/nfs/pnfs.c
parent		974cec8ca0352eb5d281535b714cf194a606e98f
NFSv4.1: pnfs: add LAYOUTGET and GETDEVICEINFO infrastructure
Add the ability to actually send LAYOUTGET and GETDEVICEINFO.  This also
adds in the machinery to handle layout state and the deviceid cache.
Note that GETDEVICEINFO is not called directly by the generic layer.
Instead it is called by the drivers while parsing the LAYOUTGET opaque
data in response to an unknown device id embedded therein.  RFC 5661
only encodes device ids within the driver-specific opaque data.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'fs/nfs/pnfs.c')
-rw-r--r--	fs/nfs/pnfs.c	385
1 file changed, 353 insertions(+), 32 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 891a0c36f992..d1ad7df3479e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -140,6 +140,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
 		printk(KERN_ERR "%s id 0 is reserved\n", __func__);
 		return status;
 	}
+	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+		printk(KERN_ERR "%s Layout driver must provide "
+		       "alloc_lseg and free_lseg.\n", __func__);
+		return status;
+	}
 
 	spin_lock(&pnfs_spinlock);
 	tmp = find_pnfs_driver_locked(ld_type->id);
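The new guard above makes alloc_lseg and free_lseg mandatory: destroy_lseg() and pnfs_layout_process() later in this patch call both hooks unconditionally. For orientation, a minimal sketch of a driver that would pass the check; the example_* names and callback bodies are hypothetical, only the id/alloc_lseg/free_lseg fields are what the check requires:

static struct pnfs_layout_segment *
example_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *res)
{
	/* A real driver decodes the opaque layout in res here, and may
	 * issue GETDEVICEINFO for any unknown device id it finds. */
	return kzalloc(sizeof(struct pnfs_layout_segment), GFP_KERNEL);
}

static void
example_free_lseg(struct pnfs_layout_segment *lseg)
{
	kfree(lseg);
}

static struct pnfs_layoutdriver_type example_layoutdriver = {
	.id		= LAYOUT_NFSV4_1_FILES,	/* layout type per RFC 5661 */
	.alloc_lseg	= example_alloc_lseg,
	.free_lseg	= example_free_lseg,
};

/* at module init: pnfs_register_layoutdriver(&example_layoutdriver); */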
@@ -168,6 +173,10 @@ pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
 }
 EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 
+/*
+ * pNFS client layout cache
+ */
+
 static void
 get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 {
@@ -190,7 +199,7 @@ put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 	}
 }
 
-static void
+void
 put_layout_hdr(struct inode *inode)
 {
 	spin_lock(&inode->i_lock);
@@ -215,7 +224,7 @@ destroy_lseg(struct kref *kref)
 	struct inode *ino = lseg->layout->inode;
 
 	dprintk("--> %s\n", __func__);
-	kfree(lseg);
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
 	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
 	put_layout_hdr(ino);
 }
@@ -249,6 +258,9 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
 	/* List does not take a reference, so no need for put here */
 	list_del_init(&lo->layouts);
 	spin_unlock(&clp->cl_lock);
+	write_seqlock(&lo->seqlock);
+	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+	write_sequnlock(&lo->seqlock);
 
 	dprintk("%s:Return\n", __func__);
 }
@@ -307,40 +319,135 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 	}
 }
 
-static void pnfs_insert_layout(struct pnfs_layout_hdr *lo,
-			       struct pnfs_layout_segment *lseg);
+/* update lo->stateid with new if is more recent
+ *
+ * lo->stateid could be the open stateid, in which case we just use what given.
+ */
+static void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+			const nfs4_stateid *new)
+{
+	nfs4_stateid *old = &lo->stateid;
+	bool overwrite = false;
+
+	write_seqlock(&lo->seqlock);
+	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
+		overwrite = true;
+	else {
+		u32 oldseq, newseq;
+
+		oldseq = be32_to_cpu(old->stateid.seqid);
+		newseq = be32_to_cpu(new->stateid.seqid);
+		if ((int)(newseq - oldseq) > 0)
+			overwrite = true;
+	}
+	if (overwrite)
+		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
+	write_sequnlock(&lo->seqlock);
+}
+
+static void
+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+			      struct nfs4_state *state)
+{
+	int seq;
+
+	dprintk("--> %s\n", __func__);
+	write_seqlock(&lo->seqlock);
+	do {
+		seq = read_seqbegin(&state->seqlock);
+		memcpy(lo->stateid.data, state->stateid.data,
+		       sizeof(state->stateid.data));
+	} while (read_seqretry(&state->seqlock, seq));
+	set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+	write_sequnlock(&lo->seqlock);
+	dprintk("<-- %s\n", __func__);
+}
+
+void
+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			struct nfs4_state *open_state)
+{
+	int seq;
 
-/* Get layout from server. */
+	dprintk("--> %s\n", __func__);
+	do {
+		seq = read_seqbegin(&lo->seqlock);
+		if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+			/* This will trigger retry of the read */
+			pnfs_layout_from_open_stateid(lo, open_state);
+		} else
+			memcpy(dst->data, lo->stateid.data,
+			       sizeof(lo->stateid.data));
+	} while (read_seqretry(&lo->seqlock, seq));
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*/
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
 	   u32 iomode)
 {
 	struct inode *ino = lo->inode;
-	struct pnfs_layout_segment *lseg;
+	struct nfs_server *server = NFS_SERVER(ino);
+	struct nfs4_layoutget *lgp;
+	struct pnfs_layout_segment *lseg = NULL;
+
+	dprintk("--> %s\n", __func__);
 
-	/* Lets pretend we sent LAYOUTGET and got a response */
-	lseg = kzalloc(sizeof(*lseg), GFP_KERNEL);
+	BUG_ON(ctx == NULL);
+	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+	if (lgp == NULL) {
+		put_layout_hdr(lo->inode);
+		return NULL;
+	}
+	lgp->args.minlength = NFS4_MAX_UINT64;
+	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+	lgp->args.range.iomode = iomode;
+	lgp->args.range.offset = 0;
+	lgp->args.range.length = NFS4_MAX_UINT64;
+	lgp->args.type = server->pnfs_curr_ld->id;
+	lgp->args.inode = ino;
+	lgp->args.ctx = get_nfs_open_context(ctx);
+	lgp->lsegpp = &lseg;
+
+	/* Synchronously retrieve layout information from server and
+	 * store in lseg.
+	 */
+	nfs4_proc_layoutget(lgp);
 	if (!lseg) {
+		/* remember that LAYOUTGET failed and suspend trying */
 		set_bit(lo_fail_bit(iomode), &lo->state);
-		spin_lock(&ino->i_lock);
-		put_layout_hdr_locked(lo);
-		spin_unlock(&ino->i_lock);
-		return NULL;
 	}
-	init_lseg(lo, lseg);
-	lseg->iomode = IOMODE_RW;
-	spin_lock(&ino->i_lock);
-	pnfs_insert_layout(lo, lseg);
-	put_layout_hdr_locked(lo);
-	spin_unlock(&ino->i_lock);
 	return lseg;
 }
 
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+cmp_layout(u32 iomode1, u32 iomode2)
+{
+	/* read > read/write */
+	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+}
+
 static void
 pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		   struct pnfs_layout_segment *lseg)
 {
+	struct pnfs_layout_segment *lp;
+	int found = 0;
+
 	dprintk("%s:Begin\n", __func__);
 
 	assert_spin_locked(&lo->inode->i_lock);
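The (int)(newseq - oldseq) > 0 test in pnfs_set_layout_stateid() above compares seqids in serial-number arithmetic, so "newer" is still decided correctly after the 32-bit seqid wraps around. A standalone userspace sketch of the same idiom (hypothetical values, not from this patch):

#include <assert.h>
#include <stdint.h>

/* same idiom as pnfs_set_layout_stateid(): "newer" in modular arithmetic */
static int seqid_is_newer(uint32_t newseq, uint32_t oldseq)
{
	return (int32_t)(newseq - oldseq) > 0;
}

int main(void)
{
	assert(seqid_is_newer(2, 1));			/* ordinary case */
	assert(!seqid_is_newer(1, 2));
	assert(seqid_is_newer(0, 0xffffffffu));		/* newer across the wrap */
	return 0;
}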
@@ -352,19 +459,28 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		list_add_tail(&lo->layouts, &clp->cl_layouts);
 		spin_unlock(&clp->cl_lock);
 	}
-	get_layout_hdr_locked(lo);
-	/* STUB - add the constructed lseg if necessary */
-	if (list_empty(&lo->segs)) {
+	list_for_each_entry(lp, &lo->segs, fi_list) {
+		if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+			continue;
+		list_add_tail(&lseg->fi_list, &lp->fi_list);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu before "
+			"lp %p iomode %d offset %llu length %llu\n",
+			__func__, lseg, lseg->range.iomode,
+			lseg->range.offset, lseg->range.length,
+			lp, lp->range.iomode, lp->range.offset,
+			lp->range.length);
+		found = 1;
+		break;
+	}
+	if (!found) {
 		list_add_tail(&lseg->fi_list, &lo->segs);
-		dprintk("%s: inserted lseg %p iomode %d at tail\n",
-			__func__, lseg, lseg->iomode);
-	} else {
-		/* There is no harm for the moment in calling this
-		 * with the lock held, and the call will be removed
-		 * with the STUB.
-		 */
-		put_lseg(lseg);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu at tail\n",
+			__func__, lseg, lseg->range.iomode,
+			lseg->range.offset, lseg->range.length);
 	}
+	get_layout_hdr_locked(lo);
 
 	dprintk("%s:Return\n", __func__);
 }
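cmp_layout() returns a positive value when its first iomode must sort before the second, and the insertion loop above skips past such entries, so RW segments collect at the head of lo->segs and read-only segments at the tail. A quick userspace check of the ordering (a sketch, not from this patch; IOMODE_* values as assigned by RFC 5661):

#include <assert.h>

enum { IOMODE_READ = 1, IOMODE_RW = 2 };	/* RFC 5661 values */

/* mirror of cmp_layout(): > 0 means iomode1 sorts before iomode2 */
static int cmp_layout(unsigned int iomode1, unsigned int iomode2)
{
	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
}

int main(void)
{
	assert(cmp_layout(IOMODE_RW, IOMODE_READ) > 0);	/* RW sorts first */
	assert(cmp_layout(IOMODE_READ, IOMODE_RW) < 0);
	assert(cmp_layout(IOMODE_RW, IOMODE_RW) == 0);	/* ties keep insertion order */
	return 0;
}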
@@ -380,6 +496,7 @@ alloc_init_layout_hdr(struct inode *ino)
 	lo->refcount = 1;
 	INIT_LIST_HEAD(&lo->layouts);
 	INIT_LIST_HEAD(&lo->segs);
+	seqlock_init(&lo->seqlock);
 	lo->inode = ino;
 	return lo;
 }
@@ -407,11 +524,46 @@ pnfs_find_alloc_layout(struct inode *ino)
 	return nfsi->layout;
 }
 
-/* STUB - LAYOUTGET never succeeds, so cache is empty */
+/*
+ * iomode matching rules:
+ * iomode	lseg	match
+ * -----	-----	-----
+ * ANY		READ	true
+ * ANY		RW	true
+ * RW		READ	false
+ * RW		RW	true
+ * READ		READ	true
+ * READ		RW	true
+ */
+static int
+is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+{
+	return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+}
+
+/*
+ * lookup range in layout
+ */
 static struct pnfs_layout_segment *
 pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
 {
-	return NULL;
+	struct pnfs_layout_segment *lseg, *ret = NULL;
+
+	dprintk("%s:Begin\n", __func__);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	list_for_each_entry(lseg, &lo->segs, fi_list) {
+		if (is_matching_lseg(lseg, iomode)) {
+			ret = lseg;
+			break;
+		}
+		if (cmp_layout(iomode, lseg->range.iomode) > 0)
+			break;
+	}
+
+	dprintk("%s:Return lseg %p ref %d\n",
+		__func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+	return ret;
 }
 
 /*
@@ -448,7 +600,7 @@ pnfs_update_layout(struct inode *ino,
 	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
 		goto out_unlock;
 
-	get_layout_hdr_locked(lo);
+	get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
 	spin_unlock(&ino->i_lock);
 
 	lseg = send_layoutget(lo, ctx, iomode);
@@ -460,3 +612,172 @@ out_unlock:
 	spin_unlock(&ino->i_lock);
 	goto out;
 }
+
+int
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+	struct nfs4_layoutget_res *res = &lgp->res;
+	struct pnfs_layout_segment *lseg;
+	struct inode *ino = lo->inode;
+	int status = 0;
+
+	/* Inject layout blob into I/O device driver */
+	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+	if (!lseg || IS_ERR(lseg)) {
+		if (!lseg)
+			status = -ENOMEM;
+		else
+			status = PTR_ERR(lseg);
+		dprintk("%s: Could not allocate layout: error %d\n",
+		       __func__, status);
+		goto out;
+	}
+
+	spin_lock(&ino->i_lock);
+	init_lseg(lo, lseg);
+	lseg->range = res->range;
+	*lgp->lsegpp = lseg;
+	pnfs_insert_layout(lo, lseg);
+
+	/* Done processing layoutget. Set the layout stateid */
+	pnfs_set_layout_stateid(lo, &res->stateid);
+	spin_unlock(&ino->i_lock);
+out:
+	return status;
+}
+
+/*
+ * Device ID cache. Currently supports one layout type per struct nfs_client.
+ * Add layout type to the lookup key to expand to support multiple types.
+ */
+int
+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
+			 void (*free_callback)(struct pnfs_deviceid_node *))
+{
+	struct pnfs_deviceid_cache *c;
+
+	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_devid_cache != NULL) {
+		atomic_inc(&clp->cl_devid_cache->dc_ref);
+		dprintk("%s [kref [%d]]\n", __func__,
+			atomic_read(&clp->cl_devid_cache->dc_ref));
+		kfree(c);
+	} else {
+		/* kzalloc initializes hlists */
+		spin_lock_init(&c->dc_lock);
+		atomic_set(&c->dc_ref, 1);
+		c->dc_free_callback = free_callback;
+		clp->cl_devid_cache = c;
+		dprintk("%s [new]\n", __func__);
+	}
+	spin_unlock(&clp->cl_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+
+/*
+ * Called from pnfs_layoutdriver_type->free_lseg
+ * last layout segment reference frees deviceid
+ */
+void
+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+		  struct pnfs_deviceid_node *devid)
+{
+	struct nfs4_deviceid *id = &devid->de_id;
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long h = nfs4_deviceid_hash(id);
+
+	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+		return;
+
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			hlist_del_rcu(&d->de_node);
+			spin_unlock(&c->dc_lock);
+			synchronize_rcu();
+			c->dc_free_callback(devid);
+			return;
+		}
+	spin_unlock(&c->dc_lock);
+	/* Why wasn't it found in the list? */
+	BUG();
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+
+/* Find and reference a deviceid */
+struct pnfs_deviceid_node *
+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long hash = nfs4_deviceid_hash(id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			if (!atomic_inc_not_zero(&d->de_ref)) {
+				goto fail;
+			} else {
+				rcu_read_unlock();
+				return d;
+			}
+		}
+	}
+fail:
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+
+/*
+ * Add a deviceid to the cache.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct pnfs_deviceid_node *
+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
+{
+	struct pnfs_deviceid_node *d;
+	long hash = nfs4_deviceid_hash(&new->de_id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	spin_lock(&c->dc_lock);
+	d = pnfs_find_get_deviceid(c, &new->de_id);
+	if (d) {
+		spin_unlock(&c->dc_lock);
+		dprintk("%s [discard]\n", __func__);
+		c->dc_free_callback(new);
+		return d;
+	}
+	INIT_HLIST_NODE(&new->de_node);
+	atomic_set(&new->de_ref, 1);
+	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+	spin_unlock(&c->dc_lock);
+	dprintk("%s [new]\n", __func__);
+	return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+
+void
+pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
+	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+
+	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
+		int i;
+		/* Verify cache is empty */
+		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
+			BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
+		clp->cl_devid_cache = NULL;
+		spin_unlock(&clp->cl_lock);
+		kfree(local);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
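As the commit message notes, GETDEVICEINFO is driven by the layout driver when it meets an unknown device id inside the LAYOUTGET opaque data. A hedged sketch of how a driver's decode path might use the cache API exported above; the example_* names, the embedding struct, and the GETDEVICEINFO step are hypothetical, while the pnfs_* calls and the de_id/cl_devid_cache fields come from this patch:

/* driver-private node embedding the generic cache entry */
struct example_deviceid_node {
	struct pnfs_deviceid_node generic;
	/* ...device description decoded from the GETDEVICEINFO reply... */
};

static void example_free_deviceid(struct pnfs_deviceid_node *d)
{
	kfree(container_of(d, struct example_deviceid_node, generic));
}

/* called while decoding LAYOUTGET opaque data that names device 'id' */
static struct pnfs_deviceid_node *
example_lookup_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
{
	struct pnfs_deviceid_node *d;
	struct example_deviceid_node *new;

	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
	if (d)
		return d;		/* cache hit, reference already taken */

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;
	memcpy(&new->generic.de_id, id, sizeof(*id));
	/* ...send GETDEVICEINFO and decode the reply into 'new' here... */

	/* racing GETDEVICEINFOs are resolved by pnfs_add_deviceid(): if
	 * another thread won, 'new' is freed via the callback and the
	 * existing node is returned with a reference held. */
	return pnfs_add_deviceid(clp->cl_devid_cache, &new->generic);
}

The cache itself would be created once per nfs_client, e.g. at mount time, with pnfs_alloc_init_deviceid_cache(clp, example_free_deviceid), dropped again through pnfs_put_deviceid_cache(clp), and individual references released from free_lseg via pnfs_put_deviceid().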