author	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-11 18:11:56 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-11 18:11:56 -0500
commit	b9d919a4ac6cf031b8e065f82ad8f1b0c9ed74b1 (patch)
tree	3139b066396956fd3794df0cb1aa74dcc9f1cb28 /fs/nfs/pnfs.c
parent	7c955fca3e1d8132982148267d9efcafae849bb6 (diff)
parent	357f54d6b38252737116a6d631f6ac28ded018ed (diff)
Merge branch 'nfs-for-2.6.38' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
* 'nfs-for-2.6.38' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6: (89 commits)
  NFS fix the setting of exchange id flag
  NFS: Don't use vm_map_ram() in readdir
  NFSv4: Ensure continued open and lockowner name uniqueness
  NFS: Move cl_delegations to the nfs_server struct
  NFS: Introduce nfs_detach_delegations()
  NFS: Move cl_state_owners and related fields to the nfs_server struct
  NFS: Allow walking nfs_client.cl_superblocks list outside client.c
  pnfs: layout roc code
  pnfs: update nfs4_callback_recallany to handle layouts
  pnfs: add CB_LAYOUTRECALL handling
  pnfs: CB_LAYOUTRECALL xdr code
  pnfs: change lo refcounting to atomic_t
  pnfs: check that partial LAYOUTGET return is ignored
  pnfs: add layout to client list before sending rpc
  pnfs: serialize LAYOUTGET(openstateid)
  pnfs: layoutget rpc code cleanup
  pnfs: change how lsegs are removed from layout list
  pnfs: change layout state seqlock to a spinlock
  pnfs: add prefix to struct pnfs_layout_hdr fields
  pnfs: add prefix to struct pnfs_layout_segment fields
  ...
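Most of the pnfs.c churn below replaces a hand-rolled `refcount` guarded by `i_lock` (plus a seqlock around the stateid) with an `atomic_t` and the `atomic_dec_and_lock()` final-put idiom ("pnfs: change lo refcounting to atomic_t", "pnfs: change layout state seqlock to a spinlock"). A minimal sketch of that idiom for orientation only; the struct and lock names here are made-up stand-ins, the real parties being `struct pnfs_layout_hdr` and `inode->i_lock` in the hunks below:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct cached_obj {				/* hypothetical stand-in */
	atomic_t refcount;
};

static void put_cached_obj(struct cached_obj *obj, spinlock_t *cache_lock)
{
	/*
	 * Fast path: drop the count without touching the lock.  Only the
	 * caller that takes the count to zero returns with the lock held,
	 * so teardown is serialized against lookups while ordinary puts
	 * stay lock-free.
	 */
	if (atomic_dec_and_lock(&obj->refcount, cache_lock)) {
		/* unlink from whatever cache_lock protects, then free */
		kfree(obj);
		spin_unlock(cache_lock);
	}
}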
Diffstat (limited to 'fs/nfs/pnfs.c')
-rw-r--r--	fs/nfs/pnfs.c	524
1 file changed, 353 insertions(+), 171 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95f..bc4089769735 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
  * pNFS client layout cache
  */
 
+/* Need to hold i_lock if caller does not already hold reference */
+void
+get_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	atomic_inc(&lo->plh_refcount);
+}
+
 static void
-get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+destroy_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-	assert_spin_locked(&lo->inode->i_lock);
-	lo->refcount++;
+	dprintk("%s: freeing layout cache %p\n", __func__, lo);
+	BUG_ON(!list_empty(&lo->plh_layouts));
+	NFS_I(lo->plh_inode)->layout = NULL;
+	kfree(lo);
 }
 
 static void
 put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 {
-	assert_spin_locked(&lo->inode->i_lock);
-	BUG_ON(lo->refcount == 0);
-
-	lo->refcount--;
-	if (!lo->refcount) {
-		dprintk("%s: freeing layout cache %p\n", __func__, lo);
-		BUG_ON(!list_empty(&lo->layouts));
-		NFS_I(lo->inode)->layout = NULL;
-		kfree(lo);
-	}
+	if (atomic_dec_and_test(&lo->plh_refcount))
+		destroy_layout_hdr(lo);
 }
 
 void
-put_layout_hdr(struct inode *inode)
+put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-	spin_lock(&inode->i_lock);
-	put_layout_hdr_locked(NFS_I(inode)->layout);
-	spin_unlock(&inode->i_lock);
+	struct inode *inode = lo->plh_inode;
+
+	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+		destroy_layout_hdr(lo);
+		spin_unlock(&inode->i_lock);
+	}
 }
 
 static void
 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 {
-	INIT_LIST_HEAD(&lseg->fi_list);
-	kref_init(&lseg->kref);
-	lseg->layout = lo;
+	INIT_LIST_HEAD(&lseg->pls_list);
+	atomic_set(&lseg->pls_refcount, 1);
+	smp_mb();
+	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+	lseg->pls_layout = lo;
 }
 
-/* Called without i_lock held, as the free_lseg call may sleep */
-static void
-destroy_lseg(struct kref *kref)
+static void free_lseg(struct pnfs_layout_segment *lseg)
 {
-	struct pnfs_layout_segment *lseg =
-		container_of(kref, struct pnfs_layout_segment, kref);
-	struct inode *ino = lseg->layout->inode;
+	struct inode *ino = lseg->pls_layout->plh_inode;
 
-	dprintk("--> %s\n", __func__);
 	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
-	put_layout_hdr(ino);
+	/* Matched by get_layout_hdr in pnfs_insert_layout */
+	put_layout_hdr(NFS_I(ino)->layout);
 }
 
-static void
-put_lseg(struct pnfs_layout_segment *lseg)
+/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
+ * could sleep, so must be called outside of the lock.
+ * Returns 1 if object was removed, otherwise return 0.
+ */
+static int
+put_lseg_locked(struct pnfs_layout_segment *lseg,
+		struct list_head *tmp_list)
+{
+	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+		atomic_read(&lseg->pls_refcount),
+		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+	if (atomic_dec_and_test(&lseg->pls_refcount)) {
+		struct inode *ino = lseg->pls_layout->plh_inode;
+
+		BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+		list_del(&lseg->pls_list);
+		if (list_empty(&lseg->pls_layout->plh_segs)) {
+			struct nfs_client *clp;
+
+			clp = NFS_SERVER(ino)->nfs_client;
+			spin_lock(&clp->cl_lock);
+			/* List does not take a reference, so no need for put here */
+			list_del_init(&lseg->pls_layout->plh_layouts);
+			spin_unlock(&clp->cl_lock);
+			clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
+		}
+		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
+		list_add(&lseg->pls_list, tmp_list);
+		return 1;
+	}
+	return 0;
+}
+
+static bool
+should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
 {
-	if (!lseg)
-		return;
+	return (recall_iomode == IOMODE_ANY ||
+		lseg_iomode == recall_iomode);
+}
 
-	dprintk("%s: lseg %p ref %d\n", __func__, lseg,
-		atomic_read(&lseg->kref.refcount));
-	kref_put(&lseg->kref, destroy_lseg);
+/* Returns 1 if lseg is removed from list, 0 otherwise */
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+			     struct list_head *tmp_list)
+{
+	int rv = 0;
+
+	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+		/* Remove the reference keeping the lseg in the
+		 * list.  It will now be removed when all
+		 * outstanding io is finished.
+		 */
+		rv = put_lseg_locked(lseg, tmp_list);
+	}
+	return rv;
 }
 
-static void
-pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+/* Returns count of number of matching invalid lsegs remaining in list
+ * after call.
+ */
+int
+mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+			    struct list_head *tmp_list,
+			    u32 iomode)
 {
 	struct pnfs_layout_segment *lseg, *next;
-	struct nfs_client *clp;
+	int invalid = 0, removed = 0;
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
-	assert_spin_locked(&lo->inode->i_lock);
-	list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
-		dprintk("%s: freeing lseg %p\n", __func__, lseg);
-		list_move(&lseg->fi_list, tmp_list);
-	}
-	clp = NFS_SERVER(lo->inode)->nfs_client;
-	spin_lock(&clp->cl_lock);
-	/* List does not take a reference, so no need for put here */
-	list_del_init(&lo->layouts);
-	spin_unlock(&clp->cl_lock);
-	write_seqlock(&lo->seqlock);
-	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-	write_sequnlock(&lo->seqlock);
-
-	dprintk("%s:Return\n", __func__);
+	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+		if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+			dprintk("%s: freeing lseg %p iomode %d "
+				"offset %llu length %llu\n", __func__,
+				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+				lseg->pls_range.length);
+			invalid++;
+			removed += mark_lseg_invalid(lseg, tmp_list);
+		}
+	dprintk("%s:Return %i\n", __func__, invalid - removed);
+	return invalid - removed;
 }
 
-static void
-pnfs_free_lseg_list(struct list_head *tmp_list)
+void
+pnfs_free_lseg_list(struct list_head *free_me)
 {
-	struct pnfs_layout_segment *lseg;
+	struct pnfs_layout_segment *lseg, *tmp;
 
-	while (!list_empty(tmp_list)) {
-		lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
-				  fi_list);
-		dprintk("%s calling put_lseg on %p\n", __func__, lseg);
-		list_del(&lseg->fi_list);
-		put_lseg(lseg);
+	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
+		list_del(&lseg->pls_list);
+		free_lseg(lseg);
 	}
 }
 
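A note on the new teardown scheme in this hunk: each lseg now carries one reference on behalf of "on the layout list and NFS_LSEG_VALID"; mark_lseg_invalid() clears the bit and drops exactly that reference, so a segment only leaves the list once outstanding I/O releases its own references. A hedged miniature of the pattern, using hypothetical names rather than the kernel structures:

#include <linux/list.h>
#include <linux/bitops.h>
#include <asm/atomic.h>

#define ENTRY_VALID 0			/* hypothetical flag bit */

struct entry {				/* hypothetical stand-in for an lseg */
	unsigned long	 flags;
	atomic_t	 refcount;	/* holds 1 for the list while VALID */
	struct list_head node;
};

/* Returns 1 if the entry could be reaped immediately, 0 otherwise. */
static int invalidate_entry(struct entry *e, struct list_head *reap_list)
{
	if (!test_and_clear_bit(ENTRY_VALID, &e->flags))
		return 0;			/* already invalidated */
	if (atomic_dec_and_test(&e->refcount)) {
		list_move(&e->node, reap_list);	/* no I/O left: reap now */
		return 1;
	}
	return 0;	/* the final put will reap it once I/O finishes */
}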
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	spin_lock(&nfsi->vfs_inode.i_lock);
 	lo = nfsi->layout;
 	if (lo) {
-		pnfs_clear_lseg_list(lo, &tmp_list);
+		set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+		mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
 		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
 		put_layout_hdr_locked(lo);
 	}
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 
 	while (!list_empty(&tmp_list)) {
 		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
-				layouts);
+				plh_layouts);
 		dprintk("%s freeing layout for inode %lu\n", __func__,
-			lo->inode->i_ino);
-		pnfs_destroy_layout(NFS_I(lo->inode));
+			lo->plh_inode->i_ino);
+		pnfs_destroy_layout(NFS_I(lo->plh_inode));
 	}
 }
 
-/* update lo->stateid with new if is more recent
- *
- * lo->stateid could be the open stateid, in which case we just use what given.
- */
-static void
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-			const nfs4_stateid *new)
-{
-	nfs4_stateid *old = &lo->stateid;
-	bool overwrite = false;
-
-	write_seqlock(&lo->seqlock);
-	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
-	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
-		overwrite = true;
-	else {
-		u32 oldseq, newseq;
-
-		oldseq = be32_to_cpu(old->stateid.seqid);
-		newseq = be32_to_cpu(new->stateid.seqid);
-		if ((int)(newseq - oldseq) > 0)
-			overwrite = true;
+/* update lo->plh_stateid with new if is more recent */
+void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+			bool update_barrier)
+{
+	u32 oldseq, newseq;
+
+	oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+	newseq = be32_to_cpu(new->stateid.seqid);
+	if ((int)(newseq - oldseq) > 0) {
+		memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
+		if (update_barrier) {
+			u32 new_barrier = be32_to_cpu(new->stateid.seqid);
+
+			if ((int)(new_barrier - lo->plh_barrier))
+				lo->plh_barrier = new_barrier;
+		} else {
+			/* Because of wraparound, we want to keep the barrier
+			 * "close" to the current seqids. It needs to be
+			 * within 2**31 to count as "behind", so if it
+			 * gets too near that limit, give us a litle leeway
+			 * and bring it to within 2**30.
+			 * NOTE - and yes, this is all unsigned arithmetic.
+			 */
+			if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+				lo->plh_barrier = newseq - (1 << 30);
+		}
 	}
-	if (overwrite)
-		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
-	write_sequnlock(&lo->seqlock);
 }
 
-static void
-pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
-			      struct nfs4_state *state)
+/* lget is set to 1 if called from inside send_layoutget call chain */
+static bool
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
+			int lget)
 {
-	int seq;
-
-	dprintk("--> %s\n", __func__);
-	write_seqlock(&lo->seqlock);
-	do {
-		seq = read_seqbegin(&state->seqlock);
-		memcpy(lo->stateid.data, state->stateid.data,
-		       sizeof(state->stateid.data));
-	} while (read_seqretry(&state->seqlock, seq));
-	set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-	write_sequnlock(&lo->seqlock);
-	dprintk("<-- %s\n", __func__);
+	if ((stateid) &&
+	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+		return true;
+	return lo->plh_block_lgets ||
+		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+		(list_empty(&lo->plh_segs) &&
+		 (atomic_read(&lo->plh_outstanding) > lget));
 }
 
-void
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-			struct nfs4_state *open_state)
+int
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			      struct nfs4_state *open_state)
 {
-	int seq;
+	int status = 0;
 
 	dprintk("--> %s\n", __func__);
-	do {
-		seq = read_seqbegin(&lo->seqlock);
-		if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
-			/* This will trigger retry of the read */
-			pnfs_layout_from_open_stateid(lo, open_state);
-		} else
-			memcpy(dst->data, lo->stateid.data,
-			       sizeof(lo->stateid.data));
-	} while (read_seqretry(&lo->seqlock, seq));
+	spin_lock(&lo->plh_inode->i_lock);
+	if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
+		status = -EAGAIN;
+	} else if (list_empty(&lo->plh_segs)) {
+		int seq;
+
+		do {
+			seq = read_seqbegin(&open_state->seqlock);
+			memcpy(dst->data, open_state->stateid.data,
+			       sizeof(open_state->stateid.data));
+		} while (read_seqretry(&open_state->seqlock, seq));
+	} else
+		memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+	spin_unlock(&lo->plh_inode->i_lock);
 	dprintk("<-- %s\n", __func__);
+	return status;
 }
 
 /*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
 	   u32 iomode)
 {
-	struct inode *ino = lo->inode;
+	struct inode *ino = lo->plh_inode;
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
 	struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 
 	BUG_ON(ctx == NULL);
 	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
-	if (lgp == NULL) {
-		put_layout_hdr(lo->inode);
+	if (lgp == NULL)
 		return NULL;
-	}
 	lgp->args.minlength = NFS4_MAX_UINT64;
 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
 	lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	nfs4_proc_layoutget(lgp);
 	if (!lseg) {
 		/* remember that LAYOUTGET failed and suspend trying */
-		set_bit(lo_fail_bit(iomode), &lo->state);
+		set_bit(lo_fail_bit(iomode), &lo->plh_flags);
 	}
 	return lseg;
 }
 
+bool pnfs_roc(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_segment *lseg, *tmp;
+	LIST_HEAD(tmp_list);
+	bool found = false;
+
+	spin_lock(&ino->i_lock);
+	lo = NFS_I(ino)->layout;
+	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+		goto out_nolayout;
+	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+			mark_lseg_invalid(lseg, &tmp_list);
+			found = true;
+		}
+	if (!found)
+		goto out_nolayout;
+	lo->plh_block_lgets++;
+	get_layout_hdr(lo); /* matched in pnfs_roc_release */
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+	return true;
+
+out_nolayout:
+	spin_unlock(&ino->i_lock);
+	return false;
+}
+
+void pnfs_roc_release(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo;
+
+	spin_lock(&ino->i_lock);
+	lo = NFS_I(ino)->layout;
+	lo->plh_block_lgets--;
+	put_layout_hdr_locked(lo);
+	spin_unlock(&ino->i_lock);
+}
+
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+	struct pnfs_layout_hdr *lo;
+
+	spin_lock(&ino->i_lock);
+	lo = NFS_I(ino)->layout;
+	if ((int)(barrier - lo->plh_barrier) > 0)
+		lo->plh_barrier = barrier;
+	spin_unlock(&ino->i_lock);
+}
+
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_segment *lseg;
+	bool found = false;
+
+	spin_lock(&ino->i_lock);
+	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+			found = true;
+			break;
+		}
+	if (!found) {
+		struct pnfs_layout_hdr *lo = nfsi->layout;
+		u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+
+		/* Since close does not return a layout stateid for use as
+		 * a barrier, we choose the worst-case barrier.
+		 */
+		*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+	}
+	spin_unlock(&ino->i_lock);
+	return found;
+}
+
 /*
  * Compare two layout segments for sorting into layout cache.
  * We want to preferentially return RW over RO layouts, so ensure those
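The four pnfs_roc*() helpers added here implement layout return-on-close: pnfs_roc() invalidates every lseg granted with return_on_close and blocks further layoutgets, pnfs_roc_drain() reports whether any ROC lseg is still pinned by I/O and supplies a worst-case barrier once drained, and pnfs_roc_set_barrier()/pnfs_roc_release() publish the barrier and drop the extra header reference. Roughly how a close path might string them together — a sketch only; the real caller is the NFSv4 CLOSE state machine elsewhere in this merge, and this function is hypothetical:

static void close_path_sketch(struct inode *ino)	/* hypothetical caller */
{
	u32 barrier;

	if (!pnfs_roc(ino))
		return;			/* nothing marked return-on-close */
	/* A real caller sleeps on roc_rpcwaitq and retries here:
	 * put_lseg_locked() wakes it as outstanding I/O drops lsegs. */
	if (!pnfs_roc_drain(ino, &barrier))
		pnfs_roc_set_barrier(ino, barrier);  /* fence stale LAYOUTGETs */
	pnfs_roc_release(ino);	/* matches get_layout_hdr() in pnfs_roc() */
}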
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 
 	dprintk("%s:Begin\n", __func__);
 
-	assert_spin_locked(&lo->inode->i_lock);
-	if (list_empty(&lo->segs)) {
-		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
-
-		spin_lock(&clp->cl_lock);
-		BUG_ON(!list_empty(&lo->layouts));
-		list_add_tail(&lo->layouts, &clp->cl_layouts);
-		spin_unlock(&clp->cl_lock);
-	}
-	list_for_each_entry(lp, &lo->segs, fi_list) {
-		if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+	assert_spin_locked(&lo->plh_inode->i_lock);
+	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
+		if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
 			continue;
-		list_add_tail(&lseg->fi_list, &lp->fi_list);
+		list_add_tail(&lseg->pls_list, &lp->pls_list);
 		dprintk("%s: inserted lseg %p "
 			"iomode %d offset %llu length %llu before "
 			"lp %p iomode %d offset %llu length %llu\n",
-			__func__, lseg, lseg->range.iomode,
-			lseg->range.offset, lseg->range.length,
-			lp, lp->range.iomode, lp->range.offset,
-			lp->range.length);
+			__func__, lseg, lseg->pls_range.iomode,
+			lseg->pls_range.offset, lseg->pls_range.length,
+			lp, lp->pls_range.iomode, lp->pls_range.offset,
+			lp->pls_range.length);
 		found = 1;
 		break;
 	}
 	if (!found) {
-		list_add_tail(&lseg->fi_list, &lo->segs);
+		list_add_tail(&lseg->pls_list, &lo->plh_segs);
 		dprintk("%s: inserted lseg %p "
 			"iomode %d offset %llu length %llu at tail\n",
-			__func__, lseg, lseg->range.iomode,
-			lseg->range.offset, lseg->range.length);
+			__func__, lseg, lseg->pls_range.iomode,
+			lseg->pls_range.offset, lseg->pls_range.length);
 	}
-	get_layout_hdr_locked(lo);
+	get_layout_hdr(lo);
 
 	dprintk("%s:Return\n", __func__);
 }
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
 	lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
 	if (!lo)
 		return NULL;
-	lo->refcount = 1;
-	INIT_LIST_HEAD(&lo->layouts);
-	INIT_LIST_HEAD(&lo->segs);
-	seqlock_init(&lo->seqlock);
-	lo->inode = ino;
+	atomic_set(&lo->plh_refcount, 1);
+	INIT_LIST_HEAD(&lo->plh_layouts);
+	INIT_LIST_HEAD(&lo->plh_segs);
+	INIT_LIST_HEAD(&lo->plh_bulk_recall);
+	lo->plh_inode = ino;
 	return lo;
 }
 
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
 	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
 
 	assert_spin_locked(&ino->i_lock);
-	if (nfsi->layout)
-		return nfsi->layout;
-
+	if (nfsi->layout) {
+		if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
+			return NULL;
+		else
+			return nfsi->layout;
+	}
 	spin_unlock(&ino->i_lock);
 	new = alloc_init_layout_hdr(ino);
 	spin_lock(&ino->i_lock);
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
 static int
 is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
 {
-	return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+	return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
 }
 
 /*
  * lookup range in layout
  */
 static struct pnfs_layout_segment *
-pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 {
 	struct pnfs_layout_segment *lseg, *ret = NULL;
 
 	dprintk("%s:Begin\n", __func__);
 
-	assert_spin_locked(&lo->inode->i_lock);
-	list_for_each_entry(lseg, &lo->segs, fi_list) {
-		if (is_matching_lseg(lseg, iomode)) {
+	assert_spin_locked(&lo->plh_inode->i_lock);
+	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+		    is_matching_lseg(lseg, iomode)) {
 			ret = lseg;
 			break;
 		}
-		if (cmp_layout(iomode, lseg->range.iomode) > 0)
+		if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
 			break;
 	}
 
 	dprintk("%s:Return lseg %p ref %d\n",
-		__func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
 	return ret;
 }
 
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
 		   enum pnfs_iomode iomode)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
+	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg = NULL;
 
@@ -588,25 +709,53 @@
 		goto out_unlock;
 	}
 
-	/* Check to see if the layout for the given range already exists */
-	lseg = pnfs_has_layout(lo, iomode);
-	if (lseg) {
-		dprintk("%s: Using cached lseg %p for iomode %d)\n",
-			__func__, lseg, iomode);
+	/* Do we even need to bother with this? */
+	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
 	}
+	/* Check to see if the layout for the given range already exists */
+	lseg = pnfs_find_lseg(lo, iomode);
+	if (lseg)
+		goto out_unlock;
 
 	/* if LAYOUTGET already failed once we don't try again */
-	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+		goto out_unlock;
+
+	if (pnfs_layoutgets_blocked(lo, NULL, 0))
 		goto out_unlock;
+	atomic_inc(&lo->plh_outstanding);
 
-	get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+	get_layout_hdr(lo);
+	if (list_empty(&lo->plh_segs)) {
+		/* The lo must be on the clp list if there is any
+		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+		 */
+		spin_lock(&clp->cl_lock);
+		BUG_ON(!list_empty(&lo->plh_layouts));
+		list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
 	spin_unlock(&ino->i_lock);
 
 	lseg = send_layoutget(lo, ctx, iomode);
+	if (!lseg) {
+		spin_lock(&ino->i_lock);
+		if (list_empty(&lo->plh_segs)) {
+			spin_lock(&clp->cl_lock);
+			list_del_init(&lo->plh_layouts);
+			spin_unlock(&clp->cl_lock);
+			clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+		}
+		spin_unlock(&ino->i_lock);
+	}
+	atomic_dec(&lo->plh_outstanding);
+	put_layout_hdr(lo);
 out:
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-		nfsi->layout->state, lseg);
+		nfsi->layout->plh_flags, lseg);
 	return lseg;
 out_unlock:
 	spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
 	struct nfs4_layoutget_res *res = &lgp->res;
 	struct pnfs_layout_segment *lseg;
-	struct inode *ino = lo->inode;
+	struct inode *ino = lo->plh_inode;
+	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	int status = 0;
 
+	/* Verify we got what we asked for.
+	 * Note that because the xdr parsing only accepts a single
+	 * element array, this can fail even if the server is behaving
+	 * correctly.
+	 */
+	if (lgp->args.range.iomode > res->range.iomode ||
+	    res->range.offset != 0 ||
+	    res->range.length != NFS4_MAX_UINT64) {
+		status = -EINVAL;
+		goto out;
+	}
 	/* Inject layout blob into I/O device driver */
 	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
 	if (!lseg || IS_ERR(lseg)) {
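The -EINVAL check added above leans on the numeric ordering of the iomode values (IOMODE_READ = 1, IOMODE_RW = 2, matching RFC 5661's layoutiomode4): a reply satisfies the request only if its iomode is at least as strong and it covers the whole file. A hedged standalone restatement of that acceptance rule (the constants mirror the kernel's; the function itself is illustrative):

#include <stdint.h>
#include <stdbool.h>

enum { IOMODE_READ = 1, IOMODE_RW = 2, IOMODE_ANY = 3 };
#define NFS4_MAX_UINT64 (~(uint64_t)0)

/* True iff the whole-file reply grants at least the requested access;
 * the kernel returns -EINVAL (and drops the layout) otherwise. */
static bool layoutget_reply_ok(uint32_t req_iomode, uint32_t res_iomode,
			       uint64_t res_offset, uint64_t res_length)
{
	return req_iomode <= res_iomode &&
	       res_offset == 0 &&
	       res_length == NFS4_MAX_UINT64;
}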
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	}
 
 	spin_lock(&ino->i_lock);
+	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+		dprintk("%s forget reply due to recall\n", __func__);
+		goto out_forget_reply;
+	}
+
+	if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+		dprintk("%s forget reply due to state\n", __func__);
+		goto out_forget_reply;
+	}
 	init_lseg(lo, lseg);
-	lseg->range = res->range;
+	lseg->pls_range = res->range;
 	*lgp->lsegpp = lseg;
 	pnfs_insert_layout(lo, lseg);
 
+	if (res->return_on_close) {
+		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
+	}
+
 	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid);
+	pnfs_set_layout_stateid(lo, &res->stateid, false);
 	spin_unlock(&ino->i_lock);
 out:
 	return status;
+
+out_forget_reply:
+	spin_unlock(&ino->i_lock);
+	lseg->pls_layout = lo;
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+	goto out;
 }
 
 /*